Created
August 14, 2011 22:11
-
-
Save bmabey/1145373 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Given an input string and a dictionary of words, | |
segment the input string into a space-separated | |
sequence of dictionary words if possible. For | |
example, if the input string is "applepie" and | |
dictionary contains a standard set of English words, | |
then we would return the string "apple pie" as output. | |
http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns word-break.core | |
(:use midje.sweet | |
[clojure.contrib.core :only [-?> -?>>]])) | |
(defn word-break [input dict] | |
(if (contains? dict input) | |
input | |
(let [length (count input) | |
candidates (map #(list (subs input 0 %) %) (range length)) | |
prefixes (filter (fn [[c _]] (contains? dict c)) candidates)] | |
(first (for [[prefix i] prefixes | |
:let [suffix (-?> (subs input i length) (word-break dict))] | |
:when suffix] | |
(str prefix " " suffix)))))) | |
;; inspired by http://stackoverflow.com/questions/3906831/how-do-i-generate-memoized-recursive-functions-in-clojure | |
(defmacro memoize-fn | |
"Produces a memoized anonymous function that can recursively call itself." | |
[fn-name & fn-args] | |
`(with-local-vars | |
[~fn-name (memoize | |
(fn ~@fn-args))] | |
(.bindRoot ~fn-name @~fn-name) | |
@~fn-name)) | |
(defn word-break-with-memoization [input dict] | |
(let [word-break | |
(memoize-fn word-break [input] | |
(if (contains? dict input) | |
input | |
(let [length (count input) | |
candidates (map #(list (subs input 0 %) %) (range length)) | |
prefixes (filter (fn [[c _]] (contains? dict c)) candidates)] | |
(first (for [[prefix i] prefixes | |
:let [suffix (-?> (subs input i length) word-break)] | |
:when suffix] | |
(str prefix " " suffix))))))] | |
(word-break input))) | |
(fact | |
(word-break "applepie" #{"apple" "pie"}) => "apple pie" | |
(word-break "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day" | |
(word-break "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil | |
(word-break-with-memoization "applepie" #{"apple" "pie"}) => "apple pie" | |
(word-break-with-memoization "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day" | |
(word-break-with-memoization "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'benchmark' | |
require 'rubygems' | |
require 'facets/enumerator' | |
require 'facets/enumerable/defer' | |
warmup = 2 # The additional warmups are for HotSpot's benefit | |
(warmup + 1).times do | |
Benchmark.bmbm do |x| | |
x.report("non-lazy one op") { (1..100000).map{|i| i + 1} } | |
x.report("lazy one op") { (1..100000).defer.map{|i| i + 1}.to_a } | |
x.report("non-lazy two ops") { (1..100000).map{|i| i + 1}.map{|i| i + 1} } | |
x.report("lazy two ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.to_a } | |
x.report("non-lazy three ops") { (1..100000).map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1} } | |
x.report("lazy three ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1}.to_a } | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/env ruby | |
require 'set' | |
require 'rubygems' | |
require 'rspec' | |
require 'facets/enumerator' | |
require 'facets/enumerable/defer' | |
# This monkey patch allows for destructing of the generated/yielded items... | |
# TODO: open a pull request with Facets with this change... | |
class Denumerator | |
def to_ary | |
self.next.to_ary | |
end | |
end | |
module WordBreak | |
class << self | |
def segmentize(string, dictionary) | |
return string if dictionary.include?(string) | |
string.size.times do |i| | |
prefix = string[0..i] | |
next unless dictionary.include?(prefix) | |
remaining_segment = segmentize(string[(i+1)..-1], dictionary) | |
return [prefix, remaining_segment].join(" ") if remaining_segment | |
end | |
nil | |
end | |
def segmentize_with_memoization(string, dictionary) | |
segmentize = memoize( -> string { | |
return string if dictionary.include?(string) | |
string.size.times do |i| | |
prefix = string[0..i] | |
next unless dictionary.include?(prefix) | |
remaining_segment = segmentize.call(string[(i+1)..-1]) | |
return [prefix, remaining_segment].join(" ") if remaining_segment | |
end | |
nil | |
}) | |
segmentize.call(string) | |
end | |
def lazy_segmentize(string, dictionary) | |
return string if dictionary.include?(string) | |
(0...(string.size)).defer. | |
map { |i| [string[0..i], i]}. | |
select { |(prefix,i)| dictionary.include?(prefix) }. | |
map{ |(prefix,i)| | |
remaining_segment = lazy_segmentize(string[(i+1)..-1], dictionary) | |
[prefix, remaining_segment].join(" ") if remaining_segment }. | |
reject {|r| r.nil? }. | |
first | |
end | |
def lazy_segmentize_with_memoization(string, dictionary) | |
segmentize = memoize( | |
-> string { | |
return string if dictionary.include?(string) | |
(0...(string.size)).defer. | |
map { |i| [string[0..i], i]}. | |
select { |(prefix,i)| dictionary.include?(prefix) }. | |
map{ |(prefix,i)| | |
remaining_segment = segmentize.call(string[(i+1)..-1]) | |
[prefix, remaining_segment].join(" ") if remaining_segment }. | |
reject {|r| r.nil? }. | |
first}) | |
segmentize.call(string) | |
end | |
private | |
def memoize(fn) | |
cache = {} | |
-> *args { cache.include?(args) ? cache[args] : cache[args] = fn.call(*args) } | |
end | |
end | |
end | |
describe WordBreak do | |
describe ".segmentize" do | |
specify do | |
WordBreak.segmentize("applepie", %w[apple pie].to_set).should == "apple pie" | |
WordBreak.segmentize("blah", %w[apple pie].to_set).should be_nil | |
WordBreak.segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day" | |
WordBreak.segmentize("aaaab", %w[a aa aaa aaaa].to_set).should be_nil | |
end | |
end | |
describe ".segmentize_with_memoization" do | |
it do | |
#WordBreak.segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie" | |
#WordBreak.segmentize_with_memoization("blah", %w[apple pie].to_set).should be_nil | |
#WordBreak.segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day" | |
WordBreak.segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil | |
end | |
end | |
describe ".lazy_segmentize" do | |
specify do | |
WordBreak.lazy_segmentize("applepie", %w[apple pie].to_set).should == "apple pie" | |
WordBreak.lazy_segmentize("blah", %w[apple pie].to_set).should == nil | |
WordBreak.lazy_segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day" | |
end | |
end | |
describe ".lazy_segmentize_with_memoization" do | |
specify do | |
WordBreak.lazy_segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie" | |
WordBreak.lazy_segmentize_with_memoization("blah", %w[apple pie].to_set).should == nil | |
WordBreak.lazy_segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day" | |
WordBreak.lazy_segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil | |
end | |
end | |
end | |
class Enumerator | |
def self.iterate(initial_value, &iterate_fn) | |
Enumerator.new do |yielder| | |
yielder.yield initial_value | |
current_value = initial_value | |
loop do | |
current_value = iterate_fn.call(current_value) | |
yielder.yield current_value | |
end | |
end | |
end | |
end | |
describe "Enumerator::iterate" do | |
it "returns an Enumerator of the block provided over the initial value and each following result" do | |
numbers = Enumerator.iterate(1) { |v| v + 1} | |
numbers.take(4).should == [1, 2, 3, 4] | |
doubles = Enumerator.iterate(1) { |v| v * 2} | |
doubles.take(4).should == [1, 2, 4, 8] | |
end | |
end | |
require 'benchmark' | |
require 'facets/random' | |
def worst_case(n) | |
["a" * n + "b", | |
Enumerator.iterate("a"){|i| i + "a"}.take(n-1).to_a] # see what I did here? ;) | |
end | |
dictionary = File.open("/usr/share/dict/words") do |f| | |
f.enum_for(:each_line).defer.map{|l| l.strip}.take(300).to_a # yep, here too... | |
end | |
def dictionary.random_words(num_words) | |
Random.srand(1) # make it deterministic | |
self.rand_subset(num_words, false).join("") | |
end | |
warmup = 2 # The additional warmups are for HotSpot's benefit | |
(warmup + 1).times do | |
Benchmark.bmbm do |x| | |
[10, 50, 100, 200, 300, 400].each do |n| | |
input, dictionary = worst_case(n) | |
x.report("non-lazy w/memo; n = #{n}") { WordBreak.segmentize_with_memoization(input, dictionary)} | |
x.report("lazy w/memo; n = #{n}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)} | |
end | |
#[10, 50, 100, 200].each do |num_words| | |
# input = dictionary.random_words(num_words) | |
# x.report("non-lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.segmentize_with_memoization(input, dictionary)} | |
# x.report("lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)} | |
#end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment