# WebPage uses cache??? What the heck is a cache in the context of a web page crawling process? It should only crawl pages. You need to inject some dummy cache to prevent caching.
# Thumbnail generation is an expensive job, thus the lazy loading. But you are not saving already generated thumbnails in the cache. :(
# Nice tests.
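
# A minimal sketch of the "dummy cache" suggestion above -- not part of the gist.
# It assumes the test swaps WebPage's class-level @cache for a null object; the
# gist exposes no cache writer, so instance_variable_set is used here:
#
#   class NullCache
#     def set(key, value); end       # discard writes
#     def get(key); nil; end         # never serve a cached copy
#     def exist?(key); false; end    # always report a cache miss
#   end
#
#   # e.g. in a test setup block:
#   Crawler::WebPage.instance_variable_set(:@cache, NullCache.new)
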
# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment

require 'net/http'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'

module Crawler

  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  # (A rough sketch of such a wrapper follows right after this class.)
  #
  class FileStorage

    def initialize(path=nil)
      raise ArgumentError, "Storage path does not exist!" if path && !File.exist?(path)
      path ||= Dir::tmpdir
      @store = Pathname.new(path)
    end

    def set(key, value); File.open( @store.join(encode(key)), 'w' ) { |file| file << Marshal.dump(value) }; end

    def get(key); Marshal.load(File.read( @store.join(encode(key)) )); end

    def exist?(key); File.exist? @store.join(encode(key)); end

    private

    def encode(key); Digest::MD5.hexdigest(key); end

  end
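
  # A hedged sketch of the "Cache class to wrap the Storage" plus expiration idea
  # mentioned above -- not part of the original exercise. The names (ExpiringCache,
  # ttl) are illustrative; the wrapper delegates to any object with the same
  # set/get/exist? trio as FileStorage and stamps each entry with its write time.
  class ExpiringCache

    def initialize(storage, ttl = 3600)
      @storage, @ttl = storage, ttl
    end

    def set(key, value)
      @storage.set(key, { :value => value, :stored_at => Time.now })
    end

    def get(key)
      entry = @storage.get(key)
      entry[:value] if entry && !expired?(entry)
    end

    def exist?(key)
      @storage.exist?(key) && !expired?(@storage.get(key))
    end

    private

    def expired?(entry)
      Time.now - entry[:stored_at] > @ttl
    end

  end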

  # = Wrap an HTML page
  #
  # The +load+ method returns the representation of an HTML page either from the cache
  # or from the network (and caches it).
  #
  # == Usage
  #
  #     require 'web_page'
  #     page = Crawler::WebPage.load('http://example.com')
  #     puts page.url
  #     puts page.body
  #     puts page.headers.inspect
  #
  class WebPage

    @cache = Crawler::FileStorage.new

    def self.load(url)
      return cache.get( url ) if cache.exist?( url )
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = WebPage.new( url.to_s, response.body, response.to_hash )
      cache.set(url.to_s, webpage)
      return webpage
    end

    attr_reader :url, :body, :headers

    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
    end

    def self.cache; @cache; end

    def thumbnail
      @thumbnail ||= create_thumbnail
    end

    private

    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end

  end

end

if $0 == __FILE__

  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'

  include Crawler

  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase

    context "When loading a URL, it" do

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        assert WebPage.cache.exist?( 'http://example.com/' ), "Response was not cached"
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url, @cached.url
        assert_equal @webpage.body, @cached.body
        assert_equal @webpage.headers, @cached.headers
      end

    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do

      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data for missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert_equal @storage.get('abc123')[:array], [1, 2, 3]
      end

    end

    # ---------------------------------------------------------------------------

  end

end

__END__
HTTP/1.1 200 OK
Server: Apache/2.2.3 (Red Hat)
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT
ETag: "b300b4-1b6-4059a80bfd280"
Accept-Ranges: bytes
Content-Type: text/html; charset=UTF-8
Connection: Keep-Alive
Date: Fri, 30 Oct 2009 09:20:03 GMT
Age: 2361
Content-Length: 438

<HTML>
<HEAD>
  <TITLE>Example Web Page</TITLE>
</HEAD>
<body>
<p>You have reached this web page by typing "example.com",
"example.net",
  or "example.org" into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
  2606</a>, Section 3.</p>
</BODY>
</HTML>

# Now, let's play design patterns freaks!!!11 :D

require 'phpfashion_com_exercise'

module LostInTranslation

  # First, we need to untangle the "mess" in WebPage.
  # Everything needs to be MUCH MORE complicated, right? :)
  # In Ruby, we just perform a little surgery on our
  # WebPage class. Let's do this!
  ::Crawler::WebPage.class_eval do

    @cache = nil

    def self.load(url)
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = ::Crawler::WebPage.new( url.to_s, response.body, response.to_hash )
      return webpage
    end

  end

  # Gee, now this makes MUCH MORE sense!
  # See, it's *storage* for chrissake, right in the name!!! :)
  class WebPageStorage

    @cache = Crawler::FileStorage.new

    def self.cache; @cache; end

    def self.load(url)
      return cache.get( url ) if cache.exist?( url ) # Already cached
      webpage = Crawler::WebPage.load( url )
      cache.set(url.to_s, webpage) # Cache the result
      return webpage
    end

  end

end

# We don't need no stinking tests, it's obvious, right?! :)
# We just `puts inspect` something, hell, it's Friday anyway!
puts "1/"
puts LostInTranslation::WebPageStorage.load('http://example.com').inspect
puts "---"
puts "2/"
puts LostInTranslation::WebPageStorage.load('http://example.com').inspect
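
# OK, one hedged test sketch after all -- not part of the original gist. It reuses
# the FakeWeb/Shoulda stack from the first file so WebPageStorage can be exercised
# without the network; the registered body below is made up for the assertion.
require 'test/unit'
require 'rubygems'
require 'fakeweb'
require 'shoulda'

class WebPageStorageTest < Test::Unit::TestCase

  context "WebPageStorage" do

    setup do
      FakeWeb.register_uri :get, 'http://example.com', :body => '<title>Example Web Page</title>'
      FakeWeb.allow_net_connect = false
    end

    should "cache the loaded page" do
      webpage = LostInTranslation::WebPageStorage.load('http://example.com/')
      assert LostInTranslation::WebPageStorage.cache.exist?('http://example.com/'), "Page was not cached"
      assert_equal webpage.body, LostInTranslation::WebPageStorage.cache.get('http://example.com/').body
    end

  end

end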