flavorjones · May 22, 2009 04:59
diff --git a/newline-strip.rb b/newline-strip.rb
 #! /usr/bin/ruby

 #
 #  I've got a string containing newlines. I've also got a regex that I
 #  want to match at the start of the string, but NOT at the start of a
 #  line.
 #
 #  Contrived example:
 #    - strings: "first string\nshould match", "second string should not match\nfirst string"
 #    - regex:   /^first string/
 #  We want to match only on the first string, not on the second string.
 #
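 #  Seems like the best way to do this, assuming my regex does not
 #  contain newlines, is to remove everything after the first newline
 #  in the string. (Alternatively, anchor the regex with \A instead of
 #  ^: \A matches only at the very start of the string, never after an
 #  embedded newline, so no truncation is needed.)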
 #
 #  But, then, what's the fastest way to remove everything after the first newline?
 #

 require 'benchmark'
 require 'open-uri'

 #doc = open("http://www.slashdot.com/").read # large doc
 doc = "   first string\nshould match" # small doc

 puts "length: #{doc.length}"

 # Number of times each strategy is executed per report.
 N = 100_000

 # Candidate ways of cutting a string off at its first newline, as
 # [report label, callable] pairs (an Array keeps the report order stable).
 # Every callable takes a String and returns a new String holding the text
 # before the first "\n"; a string without a newline comes back whole.
 #
 # Fixes relative to the earlier inline one-liners:
 #   - gsub was given the String "\n.*", which is matched literally and so
 #     never removed anything; it now gets a Regexp with /m so that "."
 #     also consumes the following lines.
 #   - delete("\n") only stripped the newline characters themselves; it is
 #     replaced by partition, which really does split at the first newline.
 #   - slice used ^ (start of any line) and +, so a string beginning with
 #     "\n" yielded its second line; it now uses \A and *.
 #   - index returns nil when there is no newline, which made slice raise;
 #     the whole length is used as the fallback.
 # Timings recorded before these fixes for "gsub:" and "delete:" measured
 # the non-truncating calls and are not comparable.
 STRATEGIES = [
   ["gsub:",        lambda { |s| s.gsub(/\n.*/m, "") }],
   ["partition:",   lambda { |s| s.partition("\n").first }],
   ["slice(re):",   lambda { |s| s.slice(/\A[^\n]*/) }],
   ["index+slice:", lambda { |s| s.slice(0, s.index("\n") || s.length) }]
 ].freeze

 # Every strategy goes through the same lambda call, so the call overhead
 # is identical across rows.
 Benchmark.bm(15) do |bm|
   STRATEGIES.each do |label, strategy|
     bm.report(label) { N.times { strategy.call(doc) } }
   end
 end

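 ## NOTE: in the runs below, the "gsub:" row timed doc.gsub("\n.*","") with a
 ## String pattern (matched literally, so nothing was ever removed) and the
 ## "delete:" row timed doc.delete("\n") (removes only the newline characters);
 ## neither of those two calls truncates at the first newline.
 ## where 'doc' is large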
 # length: 108602
 #                      user     system      total        real
 # gsub:           31.900000   9.030000  40.930000 ( 40.960776)
 # delete:         23.130000   0.080000  23.210000 ( 23.241717)
 # slice(re):       0.330000   0.020000   0.350000 (  0.347720)
 # index+slice:     0.130000   0.020000   0.150000 (  0.148842)

 ## where 'doc' is small
 # length: 25
 #                      user     system      total        real
 # gsub:            0.120000   0.020000   0.140000 (  0.132871)
 # delete:          0.150000   0.010000   0.160000 (  0.158218)
 # slice(re):       0.170000   0.020000   0.190000 (  0.185259)
 # index+slice:     0.100000   0.020000   0.120000 (  0.125409)
	#! /usr/bin/ruby

	#
	# I've got a string containing newlines. I've also got a regex that I
	# want to match at the start of the string, but NOT at the start of a
	# line.
	#
	# Contrived example:
	# - strings: "first string\nshould match", "second string should not match\nfirst string"
	# - regex: /^first string/
	# We want to match only on the first string, not on the second string.
	#
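	# Seems like the best way to do this, assuming my regex does not
	# contain newlines, is to remove everything after the first newline
	# in the string. (Alternatively, anchor the regex with \A instead of
	# ^: \A matches only at the very start of the string, never after an
	# embedded newline, so no truncation is needed.)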
	#
	# But, then, what's the fastest way to remove everything after the first newline?
	#

	require 'benchmark'
	require 'open-uri'

	#doc = open("http://www.slashdot.com/").read # large doc
	doc = " first string\nshould match" # small doc

	puts "length: #{doc.length}"

	# Number of times each strategy is executed per report.
	N = 100_000

	# Candidate ways of cutting a string off at its first newline, as
	# [report label, callable] pairs (an Array keeps the report order stable).
	# Every callable takes a String and returns a new String holding the text
	# before the first "\n"; a string without a newline comes back whole.
	#
	# Fixes relative to the earlier inline one-liners:
	#   - the block parameters were written as \|bm\| (a markup escape
	#     artifact), which is a syntax error; they are plain |bm| again.
	#   - gsub was given the String "\n.*", which is matched literally and so
	#     never removed anything; it now gets a Regexp with /m so that "."
	#     also consumes the following lines.
	#   - delete("\n") only stripped the newline characters themselves; it is
	#     replaced by partition, which really does split at the first newline.
	#   - slice used ^ (start of any line) and +, so a string beginning with
	#     "\n" yielded its second line; it now uses \A and *.
	#   - index returns nil when there is no newline, which made slice raise;
	#     the whole length is used as the fallback.
	# Timings recorded before these fixes for "gsub:" and "delete:" measured
	# the non-truncating calls and are not comparable.
	STRATEGIES = [
	  ["gsub:",        lambda { |s| s.gsub(/\n.*/m, "") }],
	  ["partition:",   lambda { |s| s.partition("\n").first }],
	  ["slice(re):",   lambda { |s| s.slice(/\A[^\n]*/) }],
	  ["index+slice:", lambda { |s| s.slice(0, s.index("\n") || s.length) }]
	].freeze

	# Every strategy goes through the same lambda call, so the call overhead
	# is identical across rows.
	Benchmark.bm(15) do |bm|
	  STRATEGIES.each do |label, strategy|
	    bm.report(label) { N.times { strategy.call(doc) } }
	  end
	end

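	## NOTE: in the runs below, the "gsub:" row timed doc.gsub("\n.*","") with a
	## String pattern (matched literally, so nothing was ever removed) and the
	## "delete:" row timed doc.delete("\n") (removes only the newline characters);
	## neither of those two calls truncates at the first newline.
	## where 'doc' is large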
	# length: 108602
	# user system total real
	# gsub: 31.900000 9.030000 40.930000 ( 40.960776)
	# delete: 23.130000 0.080000 23.210000 ( 23.241717)
	# slice(re): 0.330000 0.020000 0.350000 ( 0.347720)
	# index+slice: 0.130000 0.020000 0.150000 ( 0.148842)

	## where 'doc' is small
	# length: 25
	# user system total real
	# gsub: 0.120000 0.020000 0.140000 ( 0.132871)
	# delete: 0.150000 0.010000 0.160000 ( 0.158218)
	# slice(re): 0.170000 0.020000 0.190000 ( 0.185259)
	# index+slice: 0.100000 0.020000 0.120000 ( 0.125409)