Created
July 24, 2013 11:23
-
-
Save rgarner/6069749 to your computer and use it in GitHub Desktop.
BLURI - URI canonicalisation for Business Link, as was. Has some BL-specific bits, but otherwise good for querystring-as-hash sorting/deleting. Has own spec. One careful owner. Requires `addressable` gem, because, well, `URI` is a bit broken.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'cgi'
require 'forwardable'
require 'uri'

require 'addressable/uri'
module URI | |
##
# Extends a hash with query string reordering/deleting capabilities.
#
# Designed to be mixed into the Hash returned by CGI.parse: string keys whose
# values are a String (single occurrence), an Array of Strings (repeated key)
# or an empty Array (key absent, or present without a value).
module QueryHash
  # Renders the hash as a query string with the named keys first.
  #
  # args - keys (String or Symbol) to emit first, in the order given.
  #        Repeated names are emitted once; names with no value are skipped.
  #
  # Returns a String: the mentioned pairs followed by every other pair in
  # hash order. Keys without a value never contribute a segment, so the
  # result has no empty '&&' / leading / trailing '&' pieces.
  def ordered_query_string(*args)
    mentioned_names = args.map(&:to_s)
    unmentioned_keys = keys.reject { |key| mentioned_names.include?(key.to_s) }

    (args.uniq + unmentioned_keys).map { |key| render_value(key, self[key]) }.compact.join('&')
  end

  # Deletes the named keys (String or Symbol; looked up by their String form).
  #
  # Returns the remaining pairs rendered as a query string.
  def delete_keys(*args)
    args.uniq.each { |key| delete(key.to_s) }
    to_s
  end

  # Indifferent lookup: tries +key+ as given, then its String form.
  #
  # Returns nil rather than an empty Array, so a key that CGI.parse reports
  # as [] (absent or valueless) looks absent to callers.
  def [](key)
    item = super(key)
    # Only retry when the first lookup produced nothing usable. respond_to?
    # guards values such as Integers that have no notion of emptiness.
    item = super(key.to_s) if item.nil? || (item.respond_to?(:empty?) && item.empty?)
    item.is_a?(Array) && item.empty? ? nil : item
  end

  # Returns every pair that has a value as 'k=v' segments joined with '&'.
  def to_s
    keys.map { |key| render_value(key, self[key]) }.compact.join('&')
  end

  private

  # Renders one key as 'key=value' (repeated as 'k=a&k=b' for an Array).
  # Key and value are both CGI-escaped; non-String values are stringified.
  #
  # Returns nil when there is nothing to render.
  def render_value(key, value)
    return nil if value.nil?

    if value.is_a?(Array)
      segments = value.map { |element| render_value(key, element) }.compact
      return segments.empty? ? nil : segments.join('&')
    end

    "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
  end
end
##
# A URI class with a bit extra for messing about with query strings.
#
# Subclasses URI::HTTP so instances pass is_a?(URI::HTTP) checks, but all
# state lives in a wrapped Addressable::URI; the accessors listed in
# def_delegators are forwarded to it.
class BLURI < URI::HTTP
  extend Forwardable

  def_delegators :@uri, :scheme, :path, :host, :host=, :query, :to_s

  # uri_str - anything Addressable::URI.parse accepts.
  #
  # Raises URI::InvalidURIError when Addressable returns nil (e.g. for nil).
  def initialize(uri_str)
    @uri = ::Addressable::URI.parse(uri_str)
    raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
  end

  # Returns the query string parsed into a Hash extended with QueryHash.
  # Keys that occur once map to a String, repeated keys to an Array.
  # Memoised until #query= is called. A URI without a query yields an
  # empty hash.
  def query_hash
    @query_hash ||= begin
      # query is nil when the URI has no '?'; CGI.parse needs a String.
      parsed = CGI.parse(query.to_s)
      parsed.each_pair { |key, values| parsed[key] = values.first if values.length == 1 }
      parsed.extend(QueryHash)
    end
  end

  # Replaces the query string and drops the memoised #query_hash.
  def query=(query_str)
    @query_hash = nil
    @uri.query = query_str
  end

  # Builds an instance, first repairing known URI spec breaks in String input:
  # leading/trailing whitespace is stripped, inner spaces become %20 and, for
  # mailto: URIs, raw ampersands become %26. Non-String input is passed to
  # .new untouched.
  def self.parse(uri_str)
    if uri_str.is_a? String
      uri_str = uri_str.strip.gsub(' ', '%20')
      # \A rather than ^ so only a string that *starts* with mailto: matches.
      uri_str.gsub!('&', '%26') if uri_str =~ /\Amailto:.*&.*/
    end
    new(uri_str)
  end

  # Truthy only for http/https URIs that carry a query string.
  def has_query?
    %w{http https}.include?(@uri.scheme) && query
  end

  #
  # Reorder the query string according to symbols or string key values
  # passed in in order. Unmentioned keys follow in their existing order.
  # No-op when #has_query? is falsy. Returns self.
  #
  def reorder_query_string!(*args)
    return self unless has_query?

    self.query = query_hash.ordered_query_string(*args)
    self
  end

  # Removes the named keys from the query string.
  # No-op when #has_query? is falsy. Returns self.
  def delete_query_keys!(*args)
    return self unless has_query?

    self.query = query_hash.delete_keys(*args)
    self
  end

  # Sets query parameter +key+ to +value+.
  # No-op when #has_query? is falsy.
  def []=(key, value)
    return self unless has_query?

    # Parsed keys are Strings; normalise so uri[:q] replaces 'q' instead of
    # adding a second, symbol-keyed entry.
    query_hash[key.to_s] = value
    self.query = query_hash.to_s
    self
  end

  # Applies the Business Link canonicalisation rules in place. Returns self.
  def canonicalize!
    # 'page' for contact is useless, as is 'contactUs'
    delete_query_keys!(:contactUs, :page) if path.include?('action/findcontactdetail')

    # Non-bang sub: sub! returns nil when nothing was replaced, which would
    # wipe the host of a URL that already uses www.
    self.host = host.sub('online.', 'www.') if host && path.include?('action/findcontact')

    # r codes are removed
    delete_query_keys_matching! { |key, _value| key =~ /\Ar\./ }

    # itemId and type are served the wrong way round to the canonical url
    reorder_query_string!(:itemId, :type)
  end

  # Removes every query pair for which the block (given key and value)
  # returns a truthy value.
  # No-op when #has_query? is falsy. Returns self.
  def delete_query_keys_matching!(&block)
    return self unless has_query?

    self.query = query_hash.delete_if(&block).to_s
    self
  end
end
end | |
module Kernel
  # Global conversion-style helper: BLURI(str) is shorthand for
  # URI::BLURI.parse(str).
  #
  # uri_str - the value handed straight to URI::BLURI.parse.
  #
  # Returns whatever URI::BLURI.parse returns.
  def BLURI(uri_str)
    ::URI::BLURI.parse(uri_str)
  end
  # module_function exposes it both as Kernel.BLURI and as a private
  # instance method available everywhere.
  module_function :BLURI
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "spec_helper" | |
require 'bluri' | |
# Behavioural spec for URI::BLURI and the Kernel#BLURI helper.
describe URI::BLURI do
  ITEM_URI = 'http://www.businesslink.gov.uk/bdotg/action/detail?type=RESOURCES&itemId=1081912559'
  RCODE_ITEM_URI = 'http://www.businesslink.gov.uk/bdotg/action/detail?type=RESOURCES&r.l1=2&r.l2=3&itemId=1081912559'
  EXPECTED_QUERY = 'itemId=1081912559&type=RESOURCES'

  it "should be an HTTP URI" do
    bluri = BLURI("http://some.where.com")
    bluri.should be_a URI::HTTP
  end

  it "should not allow other schemes" do
    # Previously the expectation sat *inside* a lambda that was never called,
    # so this example always passed without checking anything. The visible
    # BLURI code does not appear to reject schemes (mailto: is explicitly
    # supported below), so the real expectation is marked pending until the
    # intended behaviour is decided.
    pending 'BLURI does not currently reject non-HTTP schemes'
    lambda { BLURI('ftp://foo') }.should raise_error(ArgumentError)
  end

  it "should not allow nil" do
    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
  end

  it "should support scheme" do
    BLURI('http://foo').scheme.should == 'http'
  end

  it "should support host" do
    BLURI('http://foo').host.should == 'foo'
  end

  it "should support path" do
    BLURI('http://foo/a/path').path.should == '/a/path'
  end

  it "should support query" do
    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
  end

  # NOTE(review): the addresses in the two mailto examples are stand-ins. The
  # copy of this spec under review had them replaced by an e-mail obfuscation
  # placeholder, so the original literals could not be recovered.
  it "should support mailto:someone@somewhere" do
    BLURI('mailto:someone@somewhere').to_s.should == 'mailto:someone@somewhere'
  end

  it "should correct unencoded ampersands in mailto" do # http://www.faqs.org/rfcs/rfc2368.html
    BLURI('mailto:fruit&veg@somewhere').to_s.should == 'mailto:fruit%26veg@somewhere'
  end

  it "should correct trailing spaces" do
    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  it "should correct leading spaces" do
    BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  describe "Query string parsing" do
    before do
      @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
    end

    it "indexes the query string" do
      @bluri.query_hash['itemid'].should == '1'
    end

    it "allows indexing by symbol" do
      @bluri.query_hash[:itemid].should == '1'
    end

    it "should show nil for absent items" do
      @bluri.query_hash[:eerie_flash].should == nil
    end

    it "indexes the query string" do
      @bluri.query_hash['type'].should == 'RESOURCE'
    end

    it "allows setting of the query" do
      @bluri.query = "furry=really"
      @bluri.to_s.should == 'http://some.com/a/path?furry=really'
    end

    describe "reordering the query string" do
      it "allows sorting of the query string by in-place replacement" do
        @bluri.reorder_query_string!(:type, :itemid, :type)
        @bluri.query.should == 'type=RESOURCE&itemid=1'
      end

      it "should bunch repeated items up" do
        bluri = BLURI('http://foo?itemid=1&type=2&itemid=3').reorder_query_string!(:itemid, :type)
        bluri.query.should == 'itemid=1&itemid=3&type=2'
      end

      it "should leave out items that weren't there" do
        bluri = BLURI('http://foo?itemid=1&type=RESOURCE')
        bluri.reorder_query_string!(:granny_smith, :itemid)
        bluri.to_s.should_not include('granny_smith')
      end

      it "should not add a query string to a URL without one" do
        bluri = BLURI('http://foo')
        bluri.reorder_query_string!(:things)
        bluri.to_s.should == 'http://foo'
      end

      it "should preserve items which weren't mentioned, but at the end" do
        bluri = BLURI('http://foo?q1=1&q2=2&q3=3')
        bluri.reorder_query_string!(:q3, :q1)
        bluri.query.should == 'q3=3&q1=1&q2=2'
      end

      it "should leave unmentioned singles alone" do
        # A local, not a constant: a constant assigned inside an example
        # leaks into the enclosing namespace and warns if re-assigned.
        topic_uri = 'http://www.businesslink.gov.uk/bdotg/action/layer?topicId=1074450344'
        bluri = BLURI(topic_uri)
        bluri.reorder_query_string!(:itemid, :type)
        bluri.to_s.should == topic_uri
      end

      it "should handle cased params" do
        bluri = BLURI(ITEM_URI).reorder_query_string!(:itemId, :type)
        bluri.query.should == EXPECTED_QUERY
      end
    end
  end

  describe "Deleting parts" do
    it "should remove specified parts" do
      BLURI('http://foo?q=1&q2=2&q3=3&q4=4').delete_query_keys!(:q2, :q3).
        query.should == 'q=1&q4=4'
    end

    it "should leave parts not present" do
      BLURI('http://foo?q=1').delete_query_keys!(:not_present).query.should == 'q=1'
    end

    it "should allow removal of keys matching a regex" do
      BLURI('http://foo?q=1&r1.l=2&r2.r=3&p=2').delete_query_keys_matching! { |k, v| k =~ /r[0-9]\..+/ }.
        query.should == 'q=1&p=2'
    end
  end

  describe "changing query keys" do
    it "should let us alter parameters" do
      uri = BLURI('http://foo?q=1&r=2')
      uri['q'] = '3'
      uri.to_s.should == 'http://foo?q=3&r=2'
    end
  end

  describe "Canonicalization" do
    it "should reorder item and type and remove r-codes" do
      BLURI(RCODE_ITEM_URI).canonicalize!.query.should include(EXPECTED_QUERY)
    end

    it "should order page query elements" do
      BLURI('http://online.businesslink.gov.uk/bdotg/action/findcontactbrowse?letter=K&page=1&topicId=1074537159').
        canonicalize!.query.should == 'letter=K&page=1&topicId=1074537159'
    end

    describe "Contact URLs" do
      subject do
        BLURI('https://online.businesslink.gov.uk/bdotg/action/findcontactdetail?itemId=1074044196&type=CONTACT&contactUs=&page=1').
          canonicalize!
      end

      it "should remove contactUs and page for contacts" do
        subject.query.should == 'itemId=1074044196&type=CONTACT'
      end

      it "should replace online. with www. to line up with canonicals" do
        subject.host.should == 'www.businesslink.gov.uk'
      end

      it "should work for contactbrowse" do
        BLURI('http://online.businesslink.gov.uk/bdotg/action/findcontactbrowse?page=1&r.l1=1073909872&topicId=1074537116').
          canonicalize!.to_s.should == 'http://www.businesslink.gov.uk/bdotg/action/findcontactbrowse?page=1&topicId=1074537116'
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment