Last active
February 7, 2019 03:29
-
-
Save koshigoe/9efdda79b37c0ab8dae843aa38c18b35 to your computer and use it in GitHub Desktop.
サロゲートペアなどを途中で切り詰めない様にしたい
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark: different ways of counting the "length" of a String.
#
# Two inputs are measured by every strategy:
#   (1) str - precomposed kana, one code point per grapheme cluster.
#   (2) gra - base kana followed by U+3099 (combining voiced sound mark),
#             i.e. two code points per grapheme cluster.
#
# String#size counts code points and is included as a baseline; all other
# strategies count grapheme clusters.
#
# NOTE: this script references ActiveSupport::Multibyte::Unicode but does not
# require ActiveSupport itself, so ActiveSupport must already be loaded when
# the script runs.
require 'benchmark'

# Number of repetitions per report.
N = 10

str = "がぎぐげご" * 100
gra = "か\u3099き\u3099く\u3099け\u3099こ\u3099" * 100

Benchmark.bm(64) do |x|
  # Baseline: code point count.
  x.report('String#size (1)') do
    N.times { str.size }
  end
  x.report('String#size (2)') do
    N.times { gra.size }
  end

  # ActiveSupport's grapheme unpacking helper.
  x.report('ActiveSupport::Multibyte::Unicode.unpack_graphemes (1)') do
    N.times { ActiveSupport::Multibyte::Unicode.unpack_graphemes(str) }
  end
  x.report('ActiveSupport::Multibyte::Unicode.unpack_graphemes (2)') do
    N.times { ActiveSupport::Multibyte::Unicode.unpack_graphemes(gra) }
  end

  # Regexp \X (extended grapheme cluster) collected into an Array.
  x.report('String#scan.size (1)') do
    N.times { str.scan(/\X/).size }
  end
  x.report('String#scan.size (2)') do
    N.times { gra.scan(/\X/).size }
  end

  # Array of grapheme clusters, then Array#size.
  x.report('String#grapheme_clusters.size (1)') do
    N.times { str.grapheme_clusters.size }
  end
  x.report('String#grapheme_clusters.size (2)') do
    N.times { gra.grapheme_clusters.size }
  end

  # Enumerator over grapheme clusters, counted with Enumerable#count.
  x.report('String#each_grapheme_cluster.count (1)') do
    N.times { str.each_grapheme_cluster.count }
  end
  x.report('String#each_grapheme_cluster.count (2)') do
    N.times { gra.each_grapheme_cluster.count }
  end

  # Enumerator over grapheme clusters, counted with Enumerator#size.
  x.report('String#each_grapheme_cluster.size (1)') do
    N.times { str.each_grapheme_cluster.size }
  end
  x.report('String#each_grapheme_cluster.size (2)') do
    N.times { gra.each_grapheme_cluster.size }
  end
end
__END__ | |
user system total real | |
String#size (1) 0.000018 0.000008 0.000026 ( 0.000022) | |
String#size (2) 0.000012 0.000002 0.000014 ( 0.000012) | |
ActiveSupport::Multibyte::Unicode.unpack_graphemes (1) 2.384050 0.043771 2.427821 ( 2.440785) | |
ActiveSupport::Multibyte::Unicode.unpack_graphemes (2) 4.380538 0.015965 4.396503 ( 4.457205) | |
String#scan.size (1) 0.005429 0.000144 0.005573 ( 0.005697) | |
String#scan.size (2) 0.006267 0.000076 0.006343 ( 0.006353) | |
String#grapheme_clusters.size (1) 0.004079 0.000104 0.004183 ( 0.004222) | |
String#grapheme_clusters.size (2) 0.003742 0.000058 0.003800 ( 0.003826) | |
String#each_grapheme_cluster.count (1) 0.004119 0.000023 0.004142 ( 0.004185) | |
String#each_grapheme_cluster.count (2) 0.003672 0.000006 0.003678 ( 0.003688) | |
String#each_grapheme_cluster.size (1) 0.003329 0.000010 0.003339 ( 0.003425) | |
String#each_grapheme_cluster.size (2) 0.003561 0.000017 0.003578 ( 0.003624) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark: code-point based truncation versus grapheme-cluster based
# truncation, plus String#gsub with /./ and /\X/ as a reference point.
#
# Inputs:
#   str - precomposed kana (one code point per grapheme cluster).
#   gra - base kana + U+3099 combining mark (two code points per cluster).
#
# Each variant is run in four configurations:
#   (1) str, ASCII omission
#   (2) str, ASCII omission, with separator
#   (3) gra, omission "\u{2702 fe0f}" (two code points)
#   (4) gra, same omission, with a separator that includes a combining mark
#
# NOTE(review): String#truncate is presumably ActiveSupport's core extension;
# it is not required by this script and must already be loaded.
# NOTE(review): the "..._with_each_grapheme_cluster_count" reports call plain
# String#truncate_graphemes, and the "..._cluster_size" /
# "..._grapheme_clusters_size" methods are not defined in this script -
# confirm which implementation each label is meant to measure.
require 'benchmark'

# Number of repetitions per report.
N = 1_000

str = "がぎぐげご" * 100
gra = "か\u3099き\u3099く\u3099け\u3099こ\u3099" * 100

Benchmark.bm(80) do |bm|
  # Code-point based truncation.
  bm.report('String#truncate (1)') { N.times { str.truncate(100, omission: "...") } }
  bm.report('String#truncate (2)') { N.times { str.truncate(100, omission: "...", separator: "ぐ") } }
  bm.report('String#truncate (3)') { N.times { gra.truncate(100, omission: "\u{2702 fe0f}") } }
  bm.report('String#truncate (4)') { N.times { gra.truncate(100, omission: "\u{2702 fe0f}", separator: "く\u3099") } }

  # Grapheme-cluster based truncation (see review note in the header about this label).
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_count (1)') { N.times { str.truncate_graphemes(100, omission: "...") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_count (2)') { N.times { str.truncate_graphemes(100, omission: "...", separator: "ぐ") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_count (3)') { N.times { gra.truncate_graphemes(100, omission: "\u{2702 fe0f}") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_count (4)') { N.times { gra.truncate_graphemes(100, omission: "\u{2702 fe0f}", separator: "く\u3099") } }

  # Variant named after each_grapheme_cluster.size.
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_size (1)') { N.times { str.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "...") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_size (2)') { N.times { str.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "...", separator: "ぐ") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_size (3)') { N.times { gra.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "\u{2702 fe0f}") } }
  bm.report('String#truncate_graphemes_with_each_grapheme_cluster_size (4)') { N.times { gra.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "\u{2702 fe0f}", separator: "く\u3099") } }

  # Variant named after grapheme_clusters.size.
  bm.report('String#truncate_graphemes_with_grapheme_clusters_size (1)') { N.times { str.truncate_graphemes_with_grapheme_clusters_size(100, omission: "...") } }
  bm.report('String#truncate_graphemes_with_grapheme_clusters_size (2)') { N.times { str.truncate_graphemes_with_grapheme_clusters_size(100, omission: "...", separator: "ぐ") } }
  bm.report('String#truncate_graphemes_with_grapheme_clusters_size (3)') { N.times { gra.truncate_graphemes_with_grapheme_clusters_size(100, omission: "\u{2702 fe0f}") } }
  bm.report('String#truncate_graphemes_with_grapheme_clusters_size (4)') { N.times { gra.truncate_graphemes_with_grapheme_clusters_size(100, omission: "\u{2702 fe0f}", separator: "く\u3099") } }

  # Reference: per-code-point (/./) versus per-grapheme-cluster (/\X/) matching.
  bm.report('String#gsub (1)') { N.times { gra.gsub(/./, '.') } }
  bm.report('String#gsub (2)') { N.times { gra.gsub(/\X/, '.') } }
end
__END__ | |
user system total real | |
String#truncate (1) 0.001988 0.000649 0.002637 ( 0.002635) | |
String#truncate (2) 0.003430 0.000627 0.004057 ( 0.004072) | |
String#truncate (3) 0.002081 0.000568 0.002649 ( 0.002652) | |
String#truncate (4) 0.004323 0.000713 0.005036 ( 0.005125) | |
String#truncate_graphemes_with_each_grapheme_cluster_count (1) 0.496839 0.035412 0.532251 ( 0.544658) | |
String#truncate_graphemes_with_each_grapheme_cluster_count (2) 0.546470 0.007729 0.554199 ( 0.568006) | |
String#truncate_graphemes_with_each_grapheme_cluster_count (3) 0.431042 0.001793 0.432835 ( 0.433713) | |
String#truncate_graphemes_with_each_grapheme_cluster_count (4) 0.422881 0.000698 0.423579 ( 0.424463) | |
String#truncate_graphemes_with_each_grapheme_cluster_size (1) 0.366217 0.004378 0.370595 ( 0.371266) | |
String#truncate_graphemes_with_each_grapheme_cluster_size (2) 0.527039 0.013160 0.540199 ( 0.550277) | |
String#truncate_graphemes_with_each_grapheme_cluster_size (3) 0.402696 0.010364 0.413060 ( 0.418723) | |
String#truncate_graphemes_with_each_grapheme_cluster_size (4) 0.390556 0.010561 0.401117 ( 0.401974) | |
String#truncate_graphemes_with_grapheme_clusters_size (1) 0.420647 0.014305 0.434952 ( 0.435687) | |
String#truncate_graphemes_with_grapheme_clusters_size (2) 0.527312 0.010127 0.537439 ( 0.544975) | |
String#truncate_graphemes_with_grapheme_clusters_size (3) 0.478933 0.006017 0.484950 ( 0.488538) | |
String#truncate_graphemes_with_grapheme_clusters_size (4) 0.426145 0.001380 0.427525 ( 0.428163) | |
String#gsub (1) 0.149356 0.000599 0.149955 ( 0.150370) | |
String#gsub (2) 0.290007 0.001179 0.291186 ( 0.291967) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class String
  # Truncates the receiver to at most +truncate_at+ grapheme clusters, so a
  # base character is never separated from its combining marks.
  #
  # Modelled on ActiveSupport's String#truncate, but every length is counted
  # in grapheme clusters instead of code points.
  #
  # @see https://github.com/rails/rails/blob/bfd296dda797e597e8a54709d1cd331cdffaa9f7/activesupport/lib/active_support/core_ext/string/filters.rb#L48-L79
  # @see https://github.com/rails/rails/blob/9cc463ed7b7be098602b72a98f72220ea6466ba2/activesupport/lib/active_support/core_ext/string/filters.rb#L81-L120
  #
  # @param truncate_at [Integer] maximum length of the result, in grapheme
  #   clusters, including the omission.
  # @param options [Hash]
  # @option options [String] :omission ('...') text appended to a truncated
  #   result; its own length is measured in grapheme clusters.
  # @option options [String, Regexp] :separator when given, the result is cut
  #   back to just before the last occurrence of the separator inside the
  #   truncated text (if there is one).
  # @return [String] a copy of the receiver when it already fits; otherwise
  #   the truncated text followed by the omission. When +truncate_at+ leaves
  #   no room for any text, only the omission is returned.
  def truncate_graphemes(truncate_at, options = {})
    return dup if each_grapheme_cluster.size <= truncate_at

    omission = options[:omission] || '...'

    # Clamp at zero: a negative count used to be interpolated into a regexp
    # interval, which never matched and left nil behind, so a later
    # +rindex+ call for :separator raised NoMethodError.
    room = [truncate_at - omission.each_grapheme_cluster.size, 0].max

    # Collect the leading clusters directly rather than through a generated
    # /\X{n}/ pattern, which the regexp engine rejects for very large n.
    truncated = each_grapheme_cluster.first(room).join

    if options[:separator] && (stop = truncated.rindex(options[:separator]))
      truncated = truncated[0, stop]
    end

    "#{truncated}#{omission}"
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
RSpec.describe String do
  describe '#truncate_graphemes' do
    # Fixtures are built from decomposed characters so that every visible
    # character is one grapheme cluster made of two code points.
    # NOTE(review): the context descriptions below say "surrogate pair", but
    # the data used here are base + combining-mark sequences.

    # "a" + U+0300 (combining grave accent), rendered as a-grave.
    let(:a_grave) { "\u{0061 0300}" }
    # "e" + U+0300 (combining grave accent), rendered as e-grave.
    let(:e_grave) { "\u{0065 0300}" }
    # Five a-grave clusters.
    let(:original) { a_grave * 5 }
    # Three groups of three a-grave clusters, joined by e-grave (11 clusters).
    let(:separated) { Array.new(3, a_grave * 3).join(e_grave) }

    context 'omission 未指定(default)' do
      context '切り詰め位置が切り詰め対象文字列の長さ(書記素クラスタ単位)未満' do
        context '切り詰め位置が omission (...) の長さ(書記素クラスタ単位)以下' do
          it 'omission だけを返す' do
            [0, 3].each do |limit|
              expect(original.truncate_graphemes(limit)).to eq '...'
            end
          end
        end

        context '切り詰め位置が omission (...) の長さ(書記素クラスタ単位)より大きい' do
          it '書記素クラスタ単位で切り詰めた結果に omission を付与する' do
            expect(original.truncate_graphemes(4)).to eq "#{a_grave}..."
          end
        end
      end

      context '切り詰め位置が切り詰め対象文字列の長さ(書記素クラスタ単位)以上' do
        it '切り詰めない' do
          [5, 6].each do |limit|
            expect(original.truncate_graphemes(limit)).to eq original
          end
        end
      end
    end

    context 'omission にサロゲートペアを含む' do
      it 'omission の長さも書記素クラスタ単位で数えた上で切り詰めを行う' do
        result = original.truncate_graphemes(3, omission: e_grave)

        # Two clusters of text plus the one-cluster omission.
        expect(result).to eq "#{a_grave * 2}#{e_grave}"
      end
    end

    context 'separator 指定あり(サロゲートペアを含む)' do
      context '切り詰め対象文字列が separator を含む' do
        context '切り詰め位置が separator である' do
          it 'その位置で切り詰める' do
            result = separated.truncate_graphemes(8, separator: e_grave, omission: '')

            # The 8th cluster is the second separator; text before it is kept.
            expect(result).to eq "#{a_grave * 3}#{e_grave}#{a_grave * 3}"
          end
        end

        context '切り詰め位置が separator でない' do
          it '一つ前の separator 直前で切り詰める' do
            result = separated.truncate_graphemes(7, separator: e_grave, omission: '')

            # Falls back to just before the first separator.
            expect(result).to eq a_grave * 3
          end
        end
      end

      context '切り詰め対象文字が separator を含まない' do
        it '切り詰める' do
          result = original.truncate_graphemes(3, separator: e_grave, omission: '')

          expect(result).to eq a_grave * 3
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment