Skip to content

Instantly share code, notes, and snippets.

@rgarner
Created July 24, 2013 11:23
Show Gist options
  • Save rgarner/6069749 to your computer and use it in GitHub Desktop.
Save rgarner/6069749 to your computer and use it in GitHub Desktop.
BLURI - URI canonicalisation for Business Link, as was. Has some BL-specific bits, but otherwise good for querystring-as-hash sorting/deleting. Has own spec. One careful owner. Requires `addressable` gem, because, well, `URI` is a bit broken.
require 'cgi'
require 'forwardable'
require 'uri'

require 'addressable/uri'
module URI
##
# Extends a Hash of query-string parameters with reordering, deleting and
# rendering capabilities.
#
# Entries may be looked up by String or Symbol. A value is either a single
# value or an Array of values (a repeated parameter); nil and [] both mean
# "no value" and are left out of any rendered query string.
module QueryHash
  # Renders the hash as a query string with the given keys first, in the
  # order given, followed by every key that was not mentioned.
  #
  # @param args [Array<String, Symbol>] keys to move to the front; keys that
  #   are absent from the hash are ignored, and :a and 'a' count as one key
  # @return [String] the reordered query string
  def ordered_query_string(*args)
    mentioned = args.uniq(&:to_s)
    mentioned_names = mentioned.map(&:to_s)
    unmentioned = keys.reject { |key| mentioned_names.include?(key.to_s) }

    (mentioned + unmentioned).map { |key| render_value(key, self[key]) }.compact.join('&')
  end

  # Removes the given keys (in either their Symbol or String form) from the
  # hash.
  #
  # @param args [Array<String, Symbol>] keys to remove
  # @return [String] the query string rendered from what is left
  def delete_keys(*args)
    args.each do |key|
      delete(key)
      delete(key.to_s)
    end
    to_s
  end

  # Looks a key up as given and, when that finds nothing (nil or an empty
  # value), again by its String form, so Symbols can index String keys.
  #
  # @param key [String, Symbol]
  # @return [Object, nil] the stored value, or nil when it is absent or []
  def [](key)
    item = super(key)
    # respond_to? guard: values such as Integers have no notion of emptiness.
    item = super(key.to_s) if item.nil? || (item.respond_to?(:empty?) && item.empty?)
    item.is_a?(Array) && item.empty? ? nil : item
  end

  # @return [String] the hash rendered as a query string in key order,
  #   leaving out keys that have no value
  def to_s
    keys.map { |key| render_value(key, self[key]) }.compact.join('&')
  end

  private

  # Renders one parameter as "key=value" (or "key=v1&key=v2" for an Array),
  # escaping both sides with CGI.escape.
  #
  # @return [String, nil] nil when there is nothing to render
  def render_value(key, value)
    return nil if value.nil?

    if value.is_a?(Array)
      rendered = value.map { |element| render_value(key, element) }.compact
      return rendered.empty? ? nil : rendered.join('&')
    end

    "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
  end
end
##
# A URI class with a bit extra for messing about with query strings.
#
# Parsing is done by Addressable::URI; the parsed object is kept in @uri and
# scheme, path, host, host=, query and to_s are delegated to it. BLURI
# subclasses URI::HTTP but its initializer does not call super, so only the
# delegated methods and the ones defined here are backed by the parsed URI.
#
class BLURI < URI::HTTP
  extend Forwardable

  # Schemes whose query strings BLURI is prepared to rewrite.
  HTTP_SCHEMES = %w[http https].freeze

  def_delegators :@uri, :scheme, :path, :host, :host=, :query, :to_s

  # @param uri_str [String] the URI to wrap
  # @raise [URI::InvalidURIError] when Addressable returns nothing for
  #   uri_str (as it does for nil)
  def initialize(uri_str)
    @uri = ::Addressable::URI.parse(uri_str)
    raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
  end

  # The query string parsed into a Hash extended with QueryHash. Parameters
  # that occur once are unwrapped to a single value; repeated parameters stay
  # Arrays. The result is memoized until #query= is called.
  #
  # @return [Hash] empty when the URI has no query string
  def query_hash
    @query_hash ||= begin
      # to_s: CGI.parse cannot take the nil query of a query-less URI.
      parsed = CGI.parse(query.to_s)
      parsed.each_pair { |key, values| parsed[key] = values.first if values.length == 1 }
      parsed.extend(QueryHash)
    end
  end

  # Replaces the query string and discards the memoized #query_hash.
  def query=(query_str)
    @query_hash = nil
    @uri.query = query_str
  end

  # Builds a BLURI, first repairing known URI spec breaks in String input:
  # leading/trailing whitespace is stripped, remaining spaces become %20 and,
  # for mailto: URIs, ampersands become %26.
  #
  # @param uri_str [String, nil]
  # @return [BLURI]
  # @raise [URI::InvalidURIError] see #initialize
  def self.parse(uri_str)
    if uri_str.is_a?(String)
      uri_str = uri_str.strip.gsub(' ', '%20')
      uri_str = uri_str.gsub('&', '%26') if uri_str.start_with?('mailto:')
    end
    new(uri_str)
  end

  # @return [Boolean] true when the scheme is http or https and a query
  #   string is present; the query-rewriting methods are no-ops otherwise
  def has_query?
    HTTP_SCHEMES.include?(scheme) && !query.nil?
  end

  #
  # Reorder the query string according to symbols or string key values
  # passed in in order
  #
  # @return [BLURI] self
  def reorder_query_string!(*args)
    return self unless has_query?

    self.query = query_hash.ordered_query_string(*args)
    self
  end

  # Removes the named keys from the query string.
  #
  # @return [BLURI] self
  def delete_query_keys!(*args)
    return self unless has_query?

    self.query = query_hash.delete_keys(*args)
    self
  end

  # Sets one query parameter. Does nothing when #has_query? is false.
  #
  # @param key [String, Symbol] parameter name
  # @param value [Object] new value
  def []=(key, value)
    return self unless has_query?

    # Parsed keys are Strings; storing under key.to_s makes a Symbol key
    # replace the existing parameter instead of adding a second entry.
    query_hash[key.to_s] = value
    self.query = query_hash.to_s
    self
  end

  # Applies the Business Link canonicalisation rules in place.
  #
  # @return [BLURI] self
  def canonicalize!
    # 'page' for contact is useless, as is 'contactUs'
    delete_query_keys!(:contactUs, :page) if path.include?('action/findcontactdetail')
    # Non-bang sub: sub! returns nil when there is nothing to replace, which
    # would wipe the host of a URL that is already on www.
    self.host = host.sub('online.', 'www.') if host && path.include?('action/findcontact')
    # r codes are removed
    delete_query_keys_matching! { |key, _value| key.start_with?('r.') }
    # itemId and type are served the wrong way round to the canonical url
    reorder_query_string!(:itemId, :type)
  end

  # Removes every query parameter for which the block returns a truthy value.
  # The block is called with the key and the value. Without a block, or when
  # #has_query? is false, nothing changes.
  #
  # @return [BLURI] self
  def delete_query_keys_matching!(&block)
    return self unless block && has_query?

    self.query = query_hash.delete_if(&block).to_s
    self
  end
end
end
module Kernel
  module_function

  # Conversion-style shortcut: BLURI(str) hands str to URI::BLURI.parse and
  # returns whatever that returns. Being a module function, it is available
  # both as a private method on every object and as Kernel.BLURI.
  def BLURI(uri_str)
    ::URI::BLURI.parse(uri_str)
  end
end
require "spec_helper"
require 'bluri'
# Specs for URI::BLURI: construction through the BLURI() helper, delegated URI
# parts, query-string indexing/reordering/deleting, and Business Link
# canonicalisation.
describe URI::BLURI do
  ITEM_URI = 'http://www.businesslink.gov.uk/bdotg/action/detail?type=RESOURCES&itemId=1081912559'
  RCODE_ITEM_URI = 'http://www.businesslink.gov.uk/bdotg/action/detail?type=RESOURCES&r.l1=2&r.l2=3&itemId=1081912559'
  TOPIC_URI = 'http://www.businesslink.gov.uk/bdotg/action/layer?topicId=1074450344'
  EXPECTED_QUERY = 'itemId=1081912559&type=RESOURCES'

  it "should be an HTTP URI" do
    bluri = BLURI("http://some.where.com")
    bluri.should be_a URI::HTTP
  end

  # BLURI does not reject non-HTTP schemes (mailto: is exercised below); it
  # only declines to rewrite their query strings.
  it "should leave the query of non-HTTP schemes alone" do
    bluri = BLURI('ftp://foo/bar?b=2&a=1')
    bluri.reorder_query_string!(:a, :b)
    bluri.query.should == 'b=2&a=1'
  end

  it "should not allow nil" do
    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
  end

  it "should support scheme" do
    BLURI('http://foo').scheme.should == 'http'
  end

  it "should support host" do
    BLURI('http://foo').host.should == 'foo'
  end

  it "should support path" do
    BLURI('http://foo/a/path').path.should == '/a/path'
  end

  it "should support query" do
    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
  end

  it "should support mailto:someone@somewhere" do
    BLURI('mailto:someone@example.com').to_s.should == 'mailto:someone@example.com'
  end

  it "should correct unencoded ampersands in mailto" do # http://www.faqs.org/rfcs/rfc2368.html
    BLURI('mailto:fruit&veg@example.com').to_s.should == 'mailto:fruit%26veg@example.com'
  end

  it "should correct trailing spaces" do
    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  it "should correct leading spaces" do
    BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  describe "Query string parsing" do
    before do
      @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
    end

    it "indexes the query string" do
      @bluri.query_hash['itemid'].should == '1'
    end

    it "allows indexing by symbol" do
      @bluri.query_hash[:itemid].should == '1'
    end

    it "should show nil for absent items" do
      @bluri.query_hash[:eerie_flash].should be_nil
    end

    it "indexes every key of the query string" do
      @bluri.query_hash['type'].should == 'RESOURCE'
    end

    it "allows setting of the query" do
      @bluri.query = "furry=really"
      @bluri.to_s.should == 'http://some.com/a/path?furry=really'
    end

    describe "reordering the query string" do
      it "allows sorting of the query string by in-place replacement" do
        @bluri.reorder_query_string!(:type, :itemid, :type)
        @bluri.query.should == 'type=RESOURCE&itemid=1'
      end

      it "should bunch repeated items up" do
        bluri = BLURI('http://foo?itemid=1&type=2&itemid=3').reorder_query_string!(:itemid, :type)
        bluri.query.should == 'itemid=1&itemid=3&type=2'
      end

      it "should leave out items that weren't there" do
        bluri = BLURI('http://foo?itemid=1&type=RESOURCE')
        bluri.reorder_query_string!(:granny_smith, :itemid)
        bluri.to_s.should_not include('granny_smith')
      end

      it "should not add a query string to a URL without one" do
        bluri = BLURI('http://foo')
        bluri.reorder_query_string!(:things)
        bluri.to_s.should == 'http://foo'
      end

      it "should preserve items which weren't mentioned, but at the end" do
        bluri = BLURI('http://foo?q1=1&q2=2&q3=3')
        bluri.reorder_query_string!(:q3, :q1)
        bluri.query.should == 'q3=3&q1=1&q2=2'
      end

      it "should leave unmentioned singles alone" do
        bluri = BLURI(TOPIC_URI)
        bluri.reorder_query_string!(:itemid, :type)
        bluri.to_s.should == TOPIC_URI
      end

      it "should handle cased params" do
        bluri = BLURI(ITEM_URI).reorder_query_string!(:itemId, :type)
        bluri.query.should == EXPECTED_QUERY
      end
    end
  end

  describe "Deleting parts" do
    it "should remove specified parts" do
      BLURI('http://foo?q=1&q2=2&q3=3&q4=4').delete_query_keys!(:q2, :q3).
        query.should == 'q=1&q4=4'
    end

    it "should leave parts not present" do
      BLURI('http://foo?q=1').delete_query_keys!(:not_present).query.should == 'q=1'
    end

    it "should allow removal of keys matching a regex" do
      BLURI('http://foo?q=1&r1.l=2&r2.r=3&p=2').delete_query_keys_matching! { |k, _v| k =~ /r[0-9]\..+/ }.
        query.should == 'q=1&p=2'
    end
  end

  describe "changing query keys" do
    it "should let us alter parameters" do
      uri = BLURI('http://foo?q=1&r=2')
      uri['q'] = '3'
      uri.to_s.should == 'http://foo?q=3&r=2'
    end
  end

  describe "Canonicalization" do
    it "should reorder item and type and remove r-codes" do
      BLURI(RCODE_ITEM_URI).canonicalize!.query.should include(EXPECTED_QUERY)
    end

    it "should order page query elements" do
      BLURI('http://online.businesslink.gov.uk/bdotg/action/findcontactbrowse?letter=K&page=1&topicId=1074537159').
        canonicalize!.query.should == 'letter=K&page=1&topicId=1074537159'
    end

    describe "Contact URLs" do
      subject do
        BLURI('https://online.businesslink.gov.uk/bdotg/action/findcontactdetail?itemId=1074044196&type=CONTACT&contactUs=&page=1').
          canonicalize!
      end

      it "should remove contactUs and page for contacts" do
        subject.query.should == 'itemId=1074044196&type=CONTACT'
      end

      it "should replace online. with www. to line up with canonicals" do
        subject.host.should == 'www.businesslink.gov.uk'
      end

      it "should work for contactbrowse" do
        BLURI('http://online.businesslink.gov.uk/bdotg/action/findcontactbrowse?page=1&amp;r.l1=1073909872&amp;topicId=1074537116').
          canonicalize!.to_s.should == 'http://www.businesslink.gov.uk/bdotg/action/findcontactbrowse?page=1&topicId=1074537116'
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment