Skip to content

Instantly share code, notes, and snippets.

@jsuchal
Forked from karmi/phpfashion_com_exercise.rb
Created November 5, 2009 07:45
Show Gist options
  • Save jsuchal/226863 to your computer and use it in GitHub Desktop.
Save jsuchal/226863 to your computer and use it in GitHub Desktop.
# WebPage uses a cache??? What the heck is a cache doing in the context of a web page crawling process? WebPage should only crawl pages. As it stands, you need to inject some dummy cache to prevent caching.
# Thumbnail generation is an expensive job, hence the lazy loading. But already generated thumbnails are not saved in the cache. :(
# Nice tests.
# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment
require 'net/http'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'
module Crawler
  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # Every value is serialized with Marshal into its own file inside the
  # storage directory; the file name is the MD5 hex digest of the key.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  #
  class FileStorage
    # path:: existing directory to keep the entries in; when omitted the
    #        system temporary directory (Dir.tmpdir) is used.
    #
    # Raises ArgumentError when +path+ is given but does not exist or is not
    # a directory.
    def initialize(path = nil)
      if path
        raise ArgumentError, "Storage path does not exist!" unless File.exist?(path)
        # A regular file would pass the check above and only blow up later,
        # on the first read or write.
        raise ArgumentError, "Storage path is not a directory!" unless File.directory?(path)
      end
      @store = Pathname.new(path || Dir.tmpdir)
    end

    # Serializes +value+ and stores it under +key+, replacing any previous entry.
    def set(key, value)
      # Marshal output is binary: write it in binary mode so no newline or
      # encoding translation can corrupt it.
      File.open(path_for(key), 'wb') { |file| file << Marshal.dump(value) }
    end

    # Returns the value stored under +key+.
    # Raises Errno::ENOENT when there is no entry; check with #exist? first.
    #
    # NOTE: Marshal.load must only be fed trusted data. Anyone able to write
    # into the storage directory (the default is the shared temp dir) can make
    # this method instantiate arbitrary objects.
    def get(key)
      File.open(path_for(key), 'rb') { |file| Marshal.load(file.read) }
    end

    # Returns true when an entry for +key+ is present, false otherwise.
    def exist?(key)
      File.exist?(path_for(key))
    end

    private

    # Location of the file backing +key+.
    def path_for(key)
      @store.join(encode(key))
    end

    # Maps an arbitrary key string to a fixed-length, filesystem-safe name.
    def encode(key)
      Digest::MD5.hexdigest(key)
    end
  end

  # = Wrap a HTML page
  #
  # The +load+ method returns the representation of HTML page either from cache,
  # or from the network (and caches it)
  #
  # == Usage
  #
  # require 'web_page'
  # page = Crawler::WebPage.load('http://example.com')
  # puts page.url
  # puts page.body
  # puts page.headers.inspect
  #
  class WebPage
    @cache = Crawler::FileStorage.new

    # The storage used by +load+ to cache fetched pages.
    def self.cache
      @cache
    end

    # Returns the WebPage for +url+ (a String), from the cache when present,
    # otherwise fetched with a GET request and then cached.
    #
    # An empty path is normalized to '/', and the normalized URL is both the
    # cache key and the +url+ of the returned page.
    def self.load(url)
      uri = URI.parse(url)
      uri.path = '/' if uri.path.to_s.empty?

      # Look up under the same normalized key that is used when storing, so
      # 'http://example.com' and 'http://example.com/' share one entry.
      key = uri.to_s
      return cache.get(key) if cache.exist?(key)

      # Keep the query string: the path alone would fetch a different resource
      # than the one the URL (and the cache key) names.
      target = uri.query ? "#{uri.path}?#{uri.query}" : uri.path

      # Block form: the connection is closed when the block returns.
      response = Net::HTTP.start(uri.host, uri.port) do |http|
        http.request_get(target)
      end

      webpage = new(key, response.body, response.to_hash)
      cache.set(key, webpage)
      webpage
    end

    attr_reader :url, :body, :headers

    # url::     the (normalized) URL of the page, as a String
    # body::    the response body
    # headers:: the response headers (+load+ passes the response's +to_hash+)
    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
    end

    # Lazily created thumbnail of the page.
    # The generator below is still a stub, so this currently returns nil.
    def thumbnail
      @thumbnail ||= create_thumbnail
    end

    private

    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end
  end
end
# Self-test: runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'
  include Crawler

  # Answer GET http://example.com with the canned response stored after
  # __END__, and disallow real network connections.
  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase
    context "When loading a URL, it" do
      # Give every test a private, empty cache. The default cache lives in the
      # system temp dir and survives between runs, so a stale entry could make
      # these tests pass without ever exercising the fetch.
      setup do
        @cache_dir = Dir.mktmpdir('web_page_test')
        WebPage.instance_variable_set(:@cache, FileStorage.new(@cache_dir))
      end

      teardown do
        FileUtils.rm_rf @cache_dir
      end

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        # exist? answers true/false, never nil, so assert_not_nil could not fail.
        assert WebPage.cache.exist?( 'http://example.com/' ), "Response was not cached"
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url, @cached.url
        assert_equal @webpage.body, @cached.body
        assert_equal @webpage.headers, @cached.headers
      end
    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do
      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        # Expected value first, actual second, so failure messages read correctly.
        assert_equal [1, 2, 3], @storage.get('abc123')[:array]
      end
    end

    # ---------------------------------------------------------------------------
  end
end
__END__
HTTP/1.1 200 OK
Server: Apache/2.2.3 (Red Hat)
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT
ETag: "b300b4-1b6-4059a80bfd280"
Accept-Ranges: bytes
Content-Type: text/html; charset=UTF-8
Connection: Keep-Alive
Date: Fri, 30 Oct 2009 09:20:03 GMT
Age: 2361   
Content-Length: 438

<HTML>
<HEAD>
  <TITLE>Example Web Page</TITLE>
</HEAD>
<body>  
<p>You have reached this web page by typing &quot;example.com&quot;,
&quot;example.net&quot;,
  or &quot;example.org&quot; into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
  2606</a>, Section 3.</p>
</BODY>
</HTML>
# Now, let's play design patterns freaks!!!11 :D
require 'phpfashion_com_exercise'
module LostInTranslation
# First, we need to untangle the "mess" in WebPage
# Everything needs to be MUCH MORE complicated, right? :)
# In Ruby, we just perform a little surgery on our
# WebPage class. Let's do this!
# Reopen Crawler::WebPage and take the caching out of it: the class-level
# cache is cleared and +load+ is replaced by a version that always performs
# the request.
::Crawler::WebPage.class_eval do
  @cache = nil

  # Fetches +url+ (a String) with a GET request and wraps the response in a
  # WebPage. An empty path is normalized to '/'.
  def self.load(url)
    uri = URI.parse(url)
    uri.path = '/' if uri.path.to_s.empty?

    # Keep the query string; requesting the bare path would fetch a different
    # resource than the URL names.
    target = uri.query ? "#{uri.path}?#{uri.query}" : uri.path

    # Block form: the connection is closed when the block returns.
    response = Net::HTTP.start(uri.host, uri.port) do |http|
      http.request_get(target)
    end

    ::Crawler::WebPage.new(uri.to_s, response.body, response.to_hash)
  end
end
# Gee, now this makes MUCH MORE sense!
# See, it's *storage* for chrissake, right in the name!!! :)
# Caching front end for Crawler::WebPage: pages are looked up in a
# FileStorage first and only loaded through WebPage on a miss.
class WebPageStorage
  @cache = Crawler::FileStorage.new

  class << self
    # The storage holding the cached pages.
    attr_reader :cache

    # Returns the page for +url+: the cached copy when one exists, otherwise
    # a freshly loaded page, which is stored in the cache before returning.
    def load(url)
      if cache.exist?(url)
        cache.get(url)
      else
        page = Crawler::WebPage.load(url)
        cache.set(url.to_s, page)
        page
      end
    end
  end
end
end
# We don't need no stinking tests, it's obvious, right?! :)
# We just `puts inspect` something, hell, it's Friday anyway!
# Load the same page twice and print what comes back each time; a "---" line
# separates the two numbered attempts.
[1, 2].each do |attempt|
  puts "---" if attempt > 1
  puts "#{attempt}/"
  page = LostInTranslation::WebPageStorage.load('http://example.com')
  puts page.inspect
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment