# WebPage uses cache??? What the heck is a cache in the context of a web page crawling process? It should only crawl pages. You need to inject some dummy cache to prevent caching.
# Thumbnail generation is an expensive job, thus the lazy loading. But you are not saving already generated thumbnails in the cache. :(
# Nice tests.
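
# A minimal sketch of the "dummy cache" suggestion above -- not part of the gist.
# It assumes the test swaps WebPage's class-level @cache for a null object; the
# gist exposes no cache writer, so instance_variable_set is used here:
#
#   class NullCache
#     def set(key, value); end       # discard writes
#     def get(key); nil; end         # never serve a cached copy
#     def exist?(key); false; end    # always report a cache miss
#   end
#
#   # e.g. in a test setup block:
#   Crawler::WebPage.instance_variable_set(:@cache, NullCache.new)
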
# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment

require 'net/http'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'

module Crawler

  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  # (A rough sketch of such a wrapper follows right after this class.)
  #
  class FileStorage

    def initialize(path=nil)
      raise ArgumentError, "Storage path does not exist!" if path && !File.exist?(path)
      path ||= Dir::tmpdir
      @store = Pathname.new(path)
    end

    def set(key, value); File.open( @store.join(encode(key)), 'w' ) { |file| file << Marshal.dump(value) }; end

    def get(key); Marshal.load(File.read( @store.join(encode(key)) )); end

    def exist?(key); File.exist? @store.join(encode(key)); end

    private

    def encode(key); Digest::MD5.hexdigest(key); end

  end
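
  # A hedged sketch of the "Cache class to wrap the Storage" plus expiration idea
  # mentioned above -- not part of the original exercise. The names (ExpiringCache,
  # ttl) are illustrative; the wrapper delegates to any object with the same
  # set/get/exist? trio as FileStorage and stamps each entry with its write time.
  class ExpiringCache

    def initialize(storage, ttl = 3600)
      @storage, @ttl = storage, ttl
    end

    def set(key, value)
      @storage.set(key, { :value => value, :stored_at => Time.now })
    end

    def get(key)
      entry = @storage.get(key)
      entry[:value] if entry && !expired?(entry)
    end

    def exist?(key)
      @storage.exist?(key) && !expired?(@storage.get(key))
    end

    private

    def expired?(entry)
      Time.now - entry[:stored_at] > @ttl
    end

  end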

  # = Wrap an HTML page
  #
  # The +load+ method returns the representation of an HTML page either from the cache
  # or from the network (and caches it).
  #
  # == Usage
  #
  #     require 'web_page'
  #     page = Crawler::WebPage.load('http://example.com')
  #     puts page.url
  #     puts page.body
  #     puts page.headers.inspect
  #
  class WebPage

    @cache = Crawler::FileStorage.new

    def self.load(url)
      return cache.get( url ) if cache.exist?( url )
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = WebPage.new( url.to_s, response.body, response.to_hash )
      cache.set(url.to_s, webpage)
      return webpage
    end

    attr_reader :url, :body, :headers

    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
    end

    def self.cache; @cache; end

    def thumbnail
      @thumbnail ||= create_thumbnail
    end

    private

    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end

  end

end

if $0 == __FILE__

  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'

  include Crawler

  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase

    context "When loading a URL, it" do

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        assert WebPage.cache.exist?( 'http://example.com/' ), "Response was not cached"
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url, @cached.url
        assert_equal @webpage.body, @cached.body
        assert_equal @webpage.headers, @cached.headers
      end

    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do

      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data for missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert_equal @storage.get('abc123')[:array], [1, 2, 3]
      end

    end

    # ---------------------------------------------------------------------------

  end

end

__END__
HTTP/1.1 200 OK
Server: Apache/2.2.3 (Red Hat)
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT
ETag: "b300b4-1b6-4059a80bfd280"
Accept-Ranges: bytes
Content-Type: text/html; charset=UTF-8
Connection: Keep-Alive
Date: Fri, 30 Oct 2009 09:20:03 GMT
Age: 2361
Content-Length: 438

<HTML>
<HEAD>
  <TITLE>Example Web Page</TITLE>
</HEAD>
<body>
<p>You have reached this web page by typing "example.com",
"example.net",
  or "example.org" into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
  2606</a>, Section 3.</p>
</BODY>
</HTML>

# Now, let's play design patterns freaks!!!11 :D

require 'phpfashion_com_exercise'

module LostInTranslation

  # First, we need to untangle the "mess" in WebPage.
  # Everything needs to be MUCH MORE complicated, right? :)
  # In Ruby, we just perform a little surgery on our
  # WebPage class. Let's do this!
  ::Crawler::WebPage.class_eval do

    @cache = nil

    def self.load(url)
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = ::Crawler::WebPage.new( url.to_s, response.body, response.to_hash )
      return webpage
    end

  end

  # Gee, now this makes MUCH MORE sense!
  # See, it's *storage* for chrissake, right in the name!!! :)
  class WebPageStorage

    @cache = Crawler::FileStorage.new

    def self.cache; @cache; end

    def self.load(url)
      return cache.get( url ) if cache.exist?( url ) # Already cached
      webpage = Crawler::WebPage.load( url )
      cache.set(url.to_s, webpage) # Cache the result
      return webpage
    end

  end

end

# We don't need no stinking tests, it's obvious, right?! :)
# We just `puts inspect` something, hell, it's Friday anyway!
puts "1/"
puts LostInTranslation::WebPageStorage.load('http://example.com').inspect
puts "---"
puts "2/"
puts LostInTranslation::WebPageStorage.load('http://example.com').inspect
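
# OK, one hedged test sketch after all -- not part of the original gist. It reuses
# the FakeWeb/Shoulda stack from the first file so WebPageStorage can be exercised
# without the network; the registered body below is made up for the assertion.
require 'test/unit'
require 'rubygems'
require 'fakeweb'
require 'shoulda'

class WebPageStorageTest < Test::Unit::TestCase

  context "WebPageStorage" do

    setup do
      FakeWeb.register_uri :get, 'http://example.com', :body => '<title>Example Web Page</title>'
      FakeWeb.allow_net_connect = false
    end

    should "cache the loaded page" do
      webpage = LostInTranslation::WebPageStorage.load('http://example.com/')
      assert LostInTranslation::WebPageStorage.cache.exist?('http://example.com/'), "Page was not cached"
      assert_equal webpage.body, LostInTranslation::WebPageStorage.cache.get('http://example.com/').body
    end

  end

end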