Created
September 16, 2014 19:00
-
-
Save avdi/1b85c8673d72635cd967 to your computer and use it in GitHub Desktop.
Stripping indentation from heredocs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample input shared by the tests and the benchmark below: an eight-line
# paragraph repeated three times, followed by a closing "The End." line.
# NOTE(review): the paragraph's own wording describes varying indentation and
# blank lines, but none are visible in this copy -- confirm against the
# original file before relying on this fixture.
TEXT = <<EOF | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
The End. | |
EOF | |
# The value every test in TestUnindent compares against: what each unindent
# implementation is expected to return when called on TEXT.
# NOTE(review): in this copy the body is line-for-line the same as TEXT;
# presumably the original differs from TEXT only in leading whitespace --
# confirm against the original file.
EXPECTED_TEXT = <<EOF | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
See, the interesting thing about this text | |
is that while it seems like the first line defines an indent | |
it's actually the last line which has the smallest indent | |
there are also some blank lines | |
both with and without extra spaces in them | |
and it just goes on and on | |
this text | |
and starts to repeat itself | |
The End. | |
EOF | |
require "minitest/autorun" | |
require "minitest" | |
require "active_support/core_ext/string" | |
require "unindent" | |
class String
  # Removes the smallest leading indent found on any non-blank line from the
  # start of every line, without splitting the string into an array of lines.
  #
  # Only lines that contain a non-whitespace character take part in choosing
  # the indent. "Smallest" means fewest characters, so the result is only
  # meaningful when every line is indented with the same character (all
  # spaces or all tabs).
  #
  # @return [String] a new string with the indent removed, or +self+ when the
  #   receiver has no non-blank lines (nothing to measure, nothing to strip)
  def unindent_scan
    indent_str = nil
    scan(/^[\t ]*(?=\S)/) do |s|
      # Strict comparison: on a tie the first indent seen is kept.
      indent_str = s if indent_str.nil? || s.size < indent_str.size
    end
    # Without this guard the method would return nil for blank input;
    # returning self matches unindent_offsets.
    return self unless indent_str

    gsub(/^#{indent_str}/, "")
  end

  # Same goal as #unindent_scan, but tries to avoid a second regexp traversal
  # by remembering where every non-blank line starts and deleting the indent
  # at those offsets. In practice it measured slower than the gsub version.
  #
  # Unlike #unindent_scan, whitespace-only lines are left untouched because
  # the scan never records an offset for them.
  #
  # @return [String] a new string with the indent removed, or +self+ when the
  #   receiver has no non-blank lines
  def unindent_offsets
    min_indent = nil
    offsets = []
    scan(/^[\t ]*(?=\S)/) do |s|
      offsets << Regexp.last_match.begin(0)
      # Compare against the integer itself. Integer#size is the byte width of
      # the number's machine representation, not its value, so calling it
      # here would pick the wrong minimum.
      min_indent = s.size if min_indent.nil? || s.size < min_indent
    end
    return self unless min_indent

    result = dup
    # Every deletion shortens the string, so later offsets (recorded against
    # the original) must be shifted left by the total removed so far.
    shift = 0
    offsets.each do |offset|
      result[offset - shift, min_indent] = ""
      shift += min_indent
    end
    result
  end
end
# Verifies that each unindent implementation turns TEXT into EXPECTED_TEXT.
#
# Inherits from Minitest::Test: the older MiniTest::Unit::TestCase name is a
# deprecated alias (the recorded run output in this file warns about it).
class TestUnindent < Minitest::Test
  # String#unindent -- presumably supplied by the "unindent" gem required above.
  def test_unindent_gem
    assert_equal EXPECTED_TEXT, TEXT.unindent
  end

  # String#strip_heredoc -- presumably from active_support/core_ext/string.
  def test_activesupport
    assert_equal EXPECTED_TEXT, TEXT.strip_heredoc
  end

  # The scan + gsub implementation defined in this file.
  def test_scan
    assert_equal EXPECTED_TEXT, TEXT.unindent_scan
  end

  # The offset-recording implementation defined in this file.
  def test_offsets
    assert_equal EXPECTED_TEXT, TEXT.unindent_offsets
  end
end
require "benchmark" | |
# Times each unindent implementation over n runs against the same TEXT input.
# bmbm does a rehearsal pass before the measured pass; 15 is the label column
# width used when printing the report.
n = 10_000
Benchmark.bmbm(15) do |x|
  # The return values are deliberately discarded: only the elapsed time matters.
  x.report("unindent gem")  { n.times { TEXT.unindent } }
  x.report("activesupport") { n.times { TEXT.strip_heredoc } }
  x.report("scan")          { n.times { TEXT.unindent_scan } }
  x.report("offsets")       { n.times { TEXT.unindent_offsets } }
end
# ~> MiniTest::Unit::TestCase is now Minitest::Test. From -:130:in `<main>' | |
# >> Rehearsal --------------------------------------------------- | |
# >> unindent gem 0.850000 0.000000 0.850000 ( 0.859678) | |
# >> activesupport 0.630000 0.000000 0.630000 ( 0.638381) | |
# >> scan 0.510000 0.000000 0.510000 ( 0.502865) | |
# >> offsets 0.710000 0.010000 0.720000 ( 0.721911) | |
# >> ------------------------------------------ total: 2.710000sec | |
# >> | |
# >> user system total real | |
# >> unindent gem 0.770000 0.000000 0.770000 ( 0.773135) | |
# >> activesupport 0.620000 0.000000 0.620000 ( 0.627442) | |
# >> scan 0.490000 0.000000 0.490000 ( 0.497729) | |
# >> offsets 0.700000 0.000000 0.700000 ( 0.704367) | |
# >> Run options: --seed 50148 | |
# >> | |
# >> # Running: | |
# >> | |
# >> .... | |
# >> | |
# >> Finished in 0.001476s, 2710.3833 runs/s, 2710.3833 assertions/s. | |
# >> | |
# >> 4 runs, 4 assertions, 0 failures, 0 errors, 0 skips |
I got a slight performance increase by playing with the regex. Lookaheads are expensive, relatively. 😉
So I changed /^[\t ]*(?=\S)/
to this /^[ ]+/
# Calculating -------------------------------------
# unindent gem 972 i/100ms
# activesupport 1221 i/100ms
# scan 1590 i/100ms
# scan regex optimized 1647 i/100ms
# offsets 1059 i/100ms
# -------------------------------------------------
# unindent gem 10637.8 (±9.6%) i/s - 53460 in 5.080267s
# activesupport 13888.1 (±10.4%) i/s - 69597 in 5.073358s
# scan 17181.8 (±9.9%) i/s - 85860 in 5.053618s
# scan regex optimized 17848.0 (±10.1%) i/s - 88938 in 5.042961s
# offsets 11629.9 (±10.0%) i/s - 58245 in 5.064342s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@avdi golfed :)
Not as fast as @chastell's but as fast as scan.
Assumes only spaces; may break in some edge cases. Edit: https://gist.github.com/danielfone/eacaf4a1f1d7f2ad425f
Edit 2: Accommodates tabs and passes all the tests from activesupport and unindent