bmabey · August 14, 2011 22:11
diff --git a/a description of the problem.txt b/a description of the problem.txt
 Given an input string and a dictionary of words,
 segment the input string into a space-separated
 sequence of dictionary words if possible. For
 example, if the input string is "applepie" and
 dictionary contains a standard set of English words,
 then we would return the string "apple pie" as output.

 http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/
diff --git a/core.clj b/core.clj
 (ns word-break.core
  (:use midje.sweet
        [clojure.contrib.core :only [-?> -?>>]]))

 (defn word-break [input dict]
  (if (contains? dict input)
    input
    (let [length (count input)
          candidates (map #(list (subs input 0 %) %) (range length))
          prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
      (first (for [[prefix i] prefixes
                   :let [suffix (-?> (subs input i length) (word-break dict))]
                   :when suffix]
               (str prefix " " suffix))))))

 ;; inspired by http://stackoverflow.com/questions/3906831/how-do-i-generate-memoized-recursive-functions-in-clojure
 (defmacro memoize-fn
  "Produces a memoized anonymous function that can recursively call itself."
  [fn-name & fn-args]
  `(with-local-vars
      [~fn-name (memoize
                (fn ~@fn-args))]
     (.bindRoot ~fn-name @~fn-name)
     @~fn-name))

 (defn word-break-with-memoization [input dict]
  (let [word-break
        (memoize-fn word-break [input]
                    (if (contains? dict input)
                      input
                      (let [length (count input)
                            candidates (map #(list (subs input 0 %) %) (range length))
                            prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
                        (first (for [[prefix i] prefixes
                                     :let [suffix (-?> (subs input i length) word-break)]
                                     :when suffix]
                                 (str prefix " " suffix))))))]
    (word-break input)))

 (fact
  (word-break "applepie" #{"apple" "pie"}) => "apple pie"
  (word-break "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
  (word-break "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil

  (word-break-with-memoization "applepie" #{"apple" "pie"}) => "apple pie"
  (word-break-with-memoization "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
  (word-break-with-memoization "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil)
diff --git a/lazy_benchmark.rb b/lazy_benchmark.rb
 require 'benchmark'
 require 'rubygems'

 require 'facets/enumerator'
 require 'facets/enumerable/defer'

 warmup = 2 # The additional warmups are for HotSpot's benefit
 (warmup + 1).times do
  Benchmark.bmbm do |x|
    x.report("non-lazy one op")  { (1..100000).map{|i| i + 1} }
    x.report("lazy one op") { (1..100000).defer.map{|i| i + 1}.to_a }
    x.report("non-lazy two ops")  { (1..100000).map{|i| i + 1}.map{|i| i + 1} }
    x.report("lazy two ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.to_a }
    x.report("non-lazy three ops")  { (1..100000).map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1} }
    x.report("lazy three ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1}.to_a }
  end
 end
diff --git a/word_break.rb b/word_break.rb
 #/usr/bin/env ruby
 require 'set'
 require 'rubygems'
 require 'rspec'
 require 'facets/enumerator'
 require 'facets/enumerable/defer'


 # This monkey patch allows for destructing of the generated/yielded items...
 # TODO: open a pull request with Facets with this change...
 class Denumerator
  def to_ary
    self.next.to_ary
  end
 end

 module WordBreak
  class << self
    def segmentize(string, dictionary)
      return string if dictionary.include?(string)
      string.size.times do |i|
        prefix = string[0..i]
        next unless dictionary.include?(prefix)
        remaining_segment = segmentize(string[(i+1)..-1], dictionary)
        return [prefix, remaining_segment].join(" ") if remaining_segment
      end
      nil
    end


    def segmentize_with_memoization(string, dictionary)
      segmentize = memoize( -> string {
        return string if dictionary.include?(string)
        string.size.times do |i|
          prefix = string[0..i]
          next unless dictionary.include?(prefix)
          remaining_segment = segmentize.call(string[(i+1)..-1])
          return [prefix, remaining_segment].join(" ") if remaining_segment
        end
        nil
      })
      segmentize.call(string)
    end

    def lazy_segmentize(string, dictionary)
      return string if dictionary.include?(string)
      (0...(string.size)).defer.
        map { |i| [string[0..i], i]}.
        select { |(prefix,i)| dictionary.include?(prefix) }.
        map{ |(prefix,i)|
          remaining_segment = lazy_segmentize(string[(i+1)..-1], dictionary)
          [prefix, remaining_segment].join(" ") if remaining_segment }.
        reject {|r| r.nil? }.
        first
    end


    def lazy_segmentize_with_memoization(string, dictionary)
      segmentize = memoize(
        -> string {
        return string if dictionary.include?(string)
        (0...(string.size)).defer.
        map { |i| [string[0..i], i]}.
        select { |(prefix,i)| dictionary.include?(prefix) }.
        map{ |(prefix,i)|
          remaining_segment = segmentize.call(string[(i+1)..-1])
          [prefix, remaining_segment].join(" ") if remaining_segment }.
        reject {|r| r.nil? }.
        first})
      segmentize.call(string)
    end

    private

    def memoize(fn)
      cache = {}
      -> *args { cache.include?(args) ? cache[args] : cache[args] = fn.call(*args) }
    end
  end
 end

 describe WordBreak do
  describe ".segmentize" do
    specify do
      WordBreak.segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
      WordBreak.segmentize("blah", %w[apple pie].to_set).should be_nil
      WordBreak.segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
      WordBreak.segmentize("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
    end
  end

  describe ".segmentize_with_memoization" do
    it do
      #WordBreak.segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
      #WordBreak.segmentize_with_memoization("blah", %w[apple pie].to_set).should be_nil
      #WordBreak.segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
      WordBreak.segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
    end
  end

  describe ".lazy_segmentize" do
    specify do
      WordBreak.lazy_segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
      WordBreak.lazy_segmentize("blah", %w[apple pie].to_set).should == nil
      WordBreak.lazy_segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
    end
  end

  describe ".lazy_segmentize_with_memoization" do
    specify do
      WordBreak.lazy_segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
      WordBreak.lazy_segmentize_with_memoization("blah", %w[apple pie].to_set).should == nil
      WordBreak.lazy_segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
      WordBreak.lazy_segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
    end
  end

 end

 class Enumerator
  def self.iterate(initial_value, &iterate_fn)
    Enumerator.new do |yielder|
      yielder.yield initial_value
      current_value = initial_value
      loop do
        current_value = iterate_fn.call(current_value)
        yielder.yield current_value
      end
    end
  end
 end


 describe "Enumerator::iterate" do
  it "returns an Enumerator of the block provided over the initial value and each following result" do
    numbers = Enumerator.iterate(1) { |v| v + 1}
    numbers.take(4).should == [1, 2, 3, 4]
    doubles = Enumerator.iterate(1) { |v| v * 2}
    doubles.take(4).should == [1, 2, 4, 8]
  end
 end

 require 'benchmark'
 require 'facets/random'

 def worst_case(n)
  ["a" * n + "b",
   Enumerator.iterate("a"){|i| i + "a"}.take(n-1).to_a] # see what I did here? ;)
 end

 dictionary = File.open("/usr/share/dict/words") do |f|
  f.enum_for(:each_line).defer.map{|l| l.strip}.take(300).to_a # yep, here too...
 end

 def dictionary.random_words(num_words)
  Random.srand(1) # make it deterministic
  self.rand_subset(num_words, false).join("")
 end


 warmup = 2 # The additional warmups are for HotSpot's benefit
 (warmup + 1).times do
  Benchmark.bmbm do |x|
    [10, 50, 100, 200, 300, 400].each do |n|
      input, dictionary = worst_case(n)
      x.report("non-lazy w/memo; n = #{n}") { WordBreak.segmentize_with_memoization(input, dictionary)}
      x.report("lazy w/memo; n = #{n}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
    end

    #[10, 50, 100, 200].each do |num_words|
    #  input = dictionary.random_words(num_words)
    #  x.report("non-lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.segmentize_with_memoization(input, dictionary)}
    #  x.report("lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
    #end
  end
 end
	Given an input string and a dictionary of words,
	segment the input string into a space-separated
	sequence of dictionary words if possible. For
	example, if the input string is "applepie" and
	dictionary contains a standard set of English words,
	then we would return the string "apple pie" as output.

	http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/
	(ns word-break.core
	(:use midje.sweet
	[clojure.contrib.core :only [-?> -?>>]]))

	(defn word-break [input dict]
	(if (contains? dict input)
	input
	(let [length (count input)
	candidates (map #(list (subs input 0 %) %) (range length))
	prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
	(first (for [[prefix i] prefixes
	:let [suffix (-?> (subs input i length) (word-break dict))]
	:when suffix]
	(str prefix " " suffix))))))

	;; inspired by http://stackoverflow.com/questions/3906831/how-do-i-generate-memoized-recursive-functions-in-clojure
	(defmacro memoize-fn
	"Produces a memoized anonymous function that can recursively call itself."
	[fn-name & fn-args]
	`(with-local-vars
	[~fn-name (memoize
	(fn ~@fn-args))]
	(.bindRoot ~fn-name @~fn-name)
	@~fn-name))

	(defn word-break-with-memoization [input dict]
	(let [word-break
	(memoize-fn word-break [input]
	(if (contains? dict input)
	input
	(let [length (count input)
	candidates (map #(list (subs input 0 %) %) (range length))
	prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
	(first (for [[prefix i] prefixes
	:let [suffix (-?> (subs input i length) word-break)]
	:when suffix]
	(str prefix " " suffix))))))]
	(word-break input)))

	(fact
	(word-break "applepie" #{"apple" "pie"}) => "apple pie"
	(word-break "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
	(word-break "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil

	(word-break-with-memoization "applepie" #{"apple" "pie"}) => "apple pie"
	(word-break-with-memoization "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
	(word-break-with-memoization "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil)
	require 'benchmark'
	require 'rubygems'

	require 'facets/enumerator'
	require 'facets/enumerable/defer'

	warmup = 2 # The additional warmups are for HotSpot's benefit
	(warmup + 1).times do
	Benchmark.bmbm do \|x\|
	x.report("non-lazy one op") { (1..100000).map{\|i\| i + 1} }
	x.report("lazy one op") { (1..100000).defer.map{\|i\| i + 1}.to_a }
	x.report("non-lazy two ops") { (1..100000).map{\|i\| i + 1}.map{\|i\| i + 1} }
	x.report("lazy two ops") { (1..100000).defer.map{\|i\| i + 1}.map{\|i\| i + 1}.to_a }
	x.report("non-lazy three ops") { (1..100000).map{\|i\| i + 1}.map{\|i\| i + 1}.map{\|i\| i + 1} }
	x.report("lazy three ops") { (1..100000).defer.map{\|i\| i + 1}.map{\|i\| i + 1}.map{\|i\| i + 1}.to_a }
	end
	end
	#/usr/bin/env ruby
	require 'set'
	require 'rubygems'
	require 'rspec'
	require 'facets/enumerator'
	require 'facets/enumerable/defer'


	# This monkey patch allows for destructing of the generated/yielded items...
	# TODO: open a pull request with Facets with this change...
	class Denumerator
	def to_ary
	self.next.to_ary
	end
	end

	module WordBreak
	class << self
	def segmentize(string, dictionary)
	return string if dictionary.include?(string)
	string.size.times do \|i\|
	prefix = string[0..i]
	next unless dictionary.include?(prefix)
	remaining_segment = segmentize(string[(i+1)..-1], dictionary)
	return [prefix, remaining_segment].join(" ") if remaining_segment
	end
	nil
	end


	def segmentize_with_memoization(string, dictionary)
	segmentize = memoize( -> string {
	return string if dictionary.include?(string)
	string.size.times do \|i\|
	prefix = string[0..i]
	next unless dictionary.include?(prefix)
	remaining_segment = segmentize.call(string[(i+1)..-1])
	return [prefix, remaining_segment].join(" ") if remaining_segment
	end
	nil
	})
	segmentize.call(string)
	end

	def lazy_segmentize(string, dictionary)
	return string if dictionary.include?(string)
	(0...(string.size)).defer.
	map { \|i\| [string[0..i], i]}.
	select { \|(prefix,i)\| dictionary.include?(prefix) }.
	map{ \|(prefix,i)\|
	remaining_segment = lazy_segmentize(string[(i+1)..-1], dictionary)
	[prefix, remaining_segment].join(" ") if remaining_segment }.
	reject {\|r\| r.nil? }.
	first
	end


	def lazy_segmentize_with_memoization(string, dictionary)
	segmentize = memoize(
	-> string {
	return string if dictionary.include?(string)
	(0...(string.size)).defer.
	map { \|i\| [string[0..i], i]}.
	select { \|(prefix,i)\| dictionary.include?(prefix) }.
	map{ \|(prefix,i)\|
	remaining_segment = segmentize.call(string[(i+1)..-1])
	[prefix, remaining_segment].join(" ") if remaining_segment }.
	reject {\|r\| r.nil? }.
	first})
	segmentize.call(string)
	end

	private

	def memoize(fn)
	cache = {}
	-> args { cache.include?(args) ? cache[args] : cache[args] = fn.call(args) }
	end
	end
	end

	describe WordBreak do
	describe ".segmentize" do
	specify do
	WordBreak.segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
	WordBreak.segmentize("blah", %w[apple pie].to_set).should be_nil
	WordBreak.segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
	WordBreak.segmentize("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
	end
	end

	describe ".segmentize_with_memoization" do
	it do
	#WordBreak.segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
	#WordBreak.segmentize_with_memoization("blah", %w[apple pie].to_set).should be_nil
	#WordBreak.segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
	WordBreak.segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
	end
	end

	describe ".lazy_segmentize" do
	specify do
	WordBreak.lazy_segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
	WordBreak.lazy_segmentize("blah", %w[apple pie].to_set).should == nil
	WordBreak.lazy_segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
	end
	end

	describe ".lazy_segmentize_with_memoization" do
	specify do
	WordBreak.lazy_segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
	WordBreak.lazy_segmentize_with_memoization("blah", %w[apple pie].to_set).should == nil
	WordBreak.lazy_segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
	WordBreak.lazy_segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
	end
	end

	end

	class Enumerator
	def self.iterate(initial_value, &iterate_fn)
	Enumerator.new do \|yielder\|
	yielder.yield initial_value
	current_value = initial_value
	loop do
	current_value = iterate_fn.call(current_value)
	yielder.yield current_value
	end
	end
	end
	end


	describe "Enumerator::iterate" do
	it "returns an Enumerator of the block provided over the initial value and each following result" do
	numbers = Enumerator.iterate(1) { \|v\| v + 1}
	numbers.take(4).should == [1, 2, 3, 4]
	doubles = Enumerator.iterate(1) { \|v\| v * 2}
	doubles.take(4).should == [1, 2, 4, 8]
	end
	end

	require 'benchmark'
	require 'facets/random'

	def worst_case(n)
	["a" * n + "b",
	Enumerator.iterate("a"){\|i\| i + "a"}.take(n-1).to_a] # see what I did here? ;)
	end

	dictionary = File.open("/usr/share/dict/words") do \|f\|
	f.enum_for(:each_line).defer.map{\|l\| l.strip}.take(300).to_a # yep, here too...
	end

	def dictionary.random_words(num_words)
	Random.srand(1) # make it deterministic
	self.rand_subset(num_words, false).join("")
	end


	warmup = 2 # The additional warmups are for HotSpot's benefit
	(warmup + 1).times do
	Benchmark.bmbm do \|x\|
	[10, 50, 100, 200, 300, 400].each do \|n\|
	input, dictionary = worst_case(n)
	x.report("non-lazy w/memo; n = #{n}") { WordBreak.segmentize_with_memoization(input, dictionary)}
	x.report("lazy w/memo; n = #{n}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
	end

	#[10, 50, 100, 200].each do \|num_words\|
	# input = dictionary.random_words(num_words)
	# x.report("non-lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.segmentize_with_memoization(input, dictionary)}
	# x.report("lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
	#end
	end
	end