Skip to content

Instantly share code, notes, and snippets.

@ProGM
Forked from agb91/Normalize.rb
Created January 26, 2016 14:36
Show Gist options
  • Save ProGM/a704b5e187ece157d73d to your computer and use it in GitHub Desktop.
Save ProGM/a704b5e187ece157d73d to your computer and use it in GitHub Desktop.
Normalizer for URLs
require 'normalize_url'
require 'rubygems'
require 'httpclient'
require 'net/http'
require 'uri'
# Writes "<text>: <value>" to standard output, followed by a newline.
#
# text  - String label placed before the colon.
# value - any object; it is converted with #to_s before printing.
def printer(text, value)
  label = text + ": "
  puts label + value.to_s
end
# Appends a trailing "/" to +u+ and parses the result.
#
# u - a URI or String; its #to_s form is used.
#
# Returns the object produced by URI.parse.
def addSlash(u)
  URI.parse(u.to_s + "/")
end
# Builds a Net::HTTP::Get request for the path of +u+ with a fixed
# browser-like User-Agent header.
#
# u - an object responding to #path (e.g. a parsed URI).
#
# If building the request raises ArgumentError (presumably because the path
# is empty), a message is printed and the request is rebuilt from
# addSlash(u) instead.
#
# Returns the Net::HTTP::Get request.
def getPathFrom(u)
  headers = { 'User-Agent' => 'Mozilla/5.0 (etc...)' }
  Net::HTTP::Get.new(u.path, headers)
rescue ArgumentError
  puts "argument error, I'll add the final slash"
  Net::HTTP::Get.new(addSlash(u).path, headers)
end
# Performs an HTTP GET for +u+ and returns the response.
#
# u - a URL String (or URI) handed to Kernel#URI.
#
# Returns the result of Net::HTTP.get_response, or nil when the URL cannot be
# parsed or a SocketError/NameError is raised while fetching it; in those
# cases a short message is printed instead.
def getResponse(u)
  Net::HTTP.get_response(URI(u))
rescue URI::InvalidURIError
  # The constant has to be namespaced: a bare InvalidURIError is not defined,
  # so evaluating the rescue clause would itself raise NameError and replace
  # whatever error had actually occurred.
  puts "character error"
  nil
rescue SocketError, NameError
  puts "I believe it's not a real url"
  nil
end
#redirect
# Follows HTTP redirects starting at +uri_str+ and returns the final URL.
#
# uri_str - URL to request (String or URI); fetched through getResponse.
# limit   - maximum number of requests still allowed (default 10).
# verbose - accepted for signature compatibility and forwarded to the
#           recursive call; not otherwise used here.
#
# Returns the URL (String) of the first non-redirect, successful response.
#
# Raises ArgumentError when the redirect limit is exhausted.
# Raises Net::HTTPBadResponse when a redirect carries no Location header.
# Raises SocketError when getResponse returned nil (no response available).
# For any other response, Net::HTTPResponse#value raises the matching error.
def tryToFollow(uri_str, limit = 10, verbose)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP redirects' if limit <= 0

  current = uri_str.to_s
  response = getResponse(uri_str)
  case response
  when Net::HTTPSuccess
    current
  when Net::HTTPRedirection
    location = response['location']
    if location.nil? || location.empty?
      raise Net::HTTPBadResponse, 'redirect without Location header'
    end
    # Location may be relative; resolve it against the URL just requested.
    # If it cannot be parsed, fall back to the raw header value.
    target = begin
      URI.join(current, location).to_s
    rescue URI::Error
      location
    end
    # Pass all three arguments: with only two, `limit - 1` would bind to
    # +verbose+ and +limit+ would silently reset to its default.
    tryToFollow(target, limit - 1, verbose)
  when nil
    raise SocketError, "no response received for #{current}"
  else
    # Raises for every non-2xx response.
    response.value
    current
  end
end
# Ensures +input+ carries an explicit http/https scheme.
#
# input - URL String, with or without a scheme.
#
# Returns +input+ itself when it already starts with "http://" or "https://"
# (case-insensitive); otherwise returns a new String prefixed with "http://".
def addHTTP(input)
  # Require the full "scheme://" prefix: checking only for the letters "http"
  # would leave hosts such as "httpbin.org" without any scheme.
  return input if input =~ %r{\Ahttps?://}i

  "http://" + input
end
#remove the s in https
# Rewrites a leading "https://" scheme to "http://".
#
# i       - URL String.
# verbose - when equal to 1, the value is printed before and after the
#           rewrite.
#
# Returns the rewritten String, or +i+ unchanged when it does not start with
# "https://" (compared case-insensitively).
def removeSinHttp(i, verbose)
  puts "prima di reg exp: " + i.to_s if verbose == 1
  # Anchor with \A (whole string, not each line) and require "://" so that a
  # host merely beginning with "https" (e.g. "httpsomething.com") is not
  # mangled.
  i = "http" + i[5..-1] if i =~ %r{\Ahttps://}i
  puts "dopo reg exp: " + i.to_s if verbose == 1
  i
end
#basic normalization of url
# Runs +i+ through NormalizeUrl.process and returns the result.
#
# i - URL String.
#
# Returns whatever NormalizeUrl.process returns for +i+. The original version
# discarded that value and returned the input untouched, so no normalization
# ever took effect.
def norm(i)
  NormalizeUrl.process(i)
end
# Normalizes +input+ by running it through addHTTP, tryToFollow (at most 10
# requests), norm and removeSinHttp, in that order.
#
# input   - URL String, with or without a scheme.
# verbose - forwarded to tryToFollow and removeSinHttp.
#
# Returns the value produced by removeSinHttp, or the String "fail" when any
# of the rescued errors below occurs (a message is printed in that case).
def normalizetor(input, verbose)
  ris = addHTTP(input)
  ris = tryToFollow(ris, 10, verbose)
  ris = norm(ris)
  removeSinHttp(ris, verbose)
rescue SocketError, NameError
  puts "this seems to be not a real url, I'll return fail"
  "fail"
rescue ArgumentError, URI::InvalidURIError => e
  # tryToFollow signals an exhausted redirect limit with ArgumentError; treat
  # that, and unparsable URLs, as a failed normalization instead of letting
  # the error escape.
  puts "Invalid url or too many redirects: " + e.to_s
  "fail"
rescue Interrupt
  puts "you pressed CTRL + C"
  "fail"
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Errno::ENETUNREACH,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => e
  puts "Generic network error: " + e.to_s
  "fail"
end
# Demo run: prints one sample URL, normalizes it, and prints the result.
# Other sample inputs are kept below; swap the active assignment to try one.
#inp = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=windows+10+OEM"
#inp = "google.it"
#inp = "https://it.wikipedia.org/wiki/Tolosa"
#inp = "gatto"
#inp = "http://www.wired.it/internet/web/2015/05/05/15-pagine-errore-404/vcd"
#inp = "http://見.香港/"
#inp = "http://ar.wikipedia.org/wiki/ نجيب_محفوظ"
inp = "http://tinyurl.com/hdzfyrq"
printer("INIZIALE", inp)
printer("FINALE", normalizetor(inp, 0))
require 'normalize_url'
require 'rubygems'
require 'httpclient'
require 'net/http'
require 'uri'
# Writes "<text>: <value>" to standard output, followed by a newline.
#
# text  - String label placed before the colon.
# value - any object; it is converted with #to_s before printing.
def printer(text, value)
  label = text + ": "
  puts label + value.to_s
end
# Appends a trailing "/" to +u+ and parses the result.
#
# u - a URI or String; its #to_s form is used.
#
# Returns the object produced by URI.parse.
def addSlash(u)
  URI.parse(u.to_s + "/")
end
# Builds a Net::HTTP::Get request for the path of +u+ with a fixed
# browser-like User-Agent header.
#
# u - an object responding to #path (e.g. a parsed URI).
#
# If building the request raises ArgumentError (presumably because the path
# is empty), a message is printed and the request is rebuilt from
# addSlash(u) instead.
#
# Returns the Net::HTTP::Get request.
def getPathFrom(u)
  headers = { 'User-Agent' => 'Mozilla/5.0 (etc...)' }
  Net::HTTP::Get.new(u.path, headers)
rescue ArgumentError
  puts "argument error, I'll add the final slash"
  Net::HTTP::Get.new(addSlash(u).path, headers)
end
# Performs an HTTP GET for +u+ and returns the response.
#
# u - a URL String (or URI) handed to Kernel#URI.
#
# Returns the result of Net::HTTP.get_response, or nil when the URL cannot be
# parsed or a SocketError/NameError is raised while fetching it; in those
# cases a short message is printed instead.
def getResponse(u)
  Net::HTTP.get_response(URI(u))
rescue URI::InvalidURIError
  # The constant has to be namespaced: a bare InvalidURIError is not defined,
  # so evaluating the rescue clause would itself raise NameError and replace
  # whatever error had actually occurred.
  puts "character error"
  nil
rescue SocketError, NameError
  puts "I believe it's not a real url"
  nil
end
#redirect
# Follows HTTP redirects starting at +uri_str+ and returns the final URL.
#
# uri_str - URL to request (String or URI); fetched through getResponse.
# limit   - maximum number of requests still allowed (default 10).
# verbose - accepted for signature compatibility and forwarded to the
#           recursive call; not otherwise used here.
#
# Returns the URL (String) of the first non-redirect, successful response.
#
# Raises ArgumentError when the redirect limit is exhausted.
# Raises Net::HTTPBadResponse when a redirect carries no Location header.
# Raises SocketError when getResponse returned nil (no response available).
# For any other response, Net::HTTPResponse#value raises the matching error.
def tryToFollow(uri_str, limit = 10, verbose)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP redirects' if limit <= 0

  current = uri_str.to_s
  response = getResponse(uri_str)
  case response
  when Net::HTTPSuccess
    current
  when Net::HTTPRedirection
    location = response['location']
    if location.nil? || location.empty?
      raise Net::HTTPBadResponse, 'redirect without Location header'
    end
    # Location may be relative; resolve it against the URL just requested.
    # If it cannot be parsed, fall back to the raw header value.
    target = begin
      URI.join(current, location).to_s
    rescue URI::Error
      location
    end
    # Pass all three arguments: with only two, `limit - 1` would bind to
    # +verbose+ and +limit+ would silently reset to its default.
    tryToFollow(target, limit - 1, verbose)
  when nil
    raise SocketError, "no response received for #{current}"
  else
    # Raises for every non-2xx response.
    response.value
    current
  end
end
# Ensures +input+ carries an explicit http/https scheme.
#
# input - URL String, with or without a scheme.
#
# Returns +input+ itself when it already starts with "http://" or "https://"
# (case-insensitive); otherwise returns a new String prefixed with "http://".
def addHTTP(input)
  # Require the full "scheme://" prefix: checking only for the letters "http"
  # would leave hosts such as "httpbin.org" without any scheme.
  return input if input =~ %r{\Ahttps?://}i

  "http://" + input
end
#remove the s in https
# Rewrites a leading "https://" scheme to "http://".
#
# i       - URL String.
# verbose - when equal to 1, the value is printed before and after the
#           rewrite.
#
# Returns the rewritten String, or +i+ unchanged when it does not start with
# "https://" (compared case-insensitively).
def removeSinHttp(i, verbose)
  puts "prima di reg exp: " + i.to_s if verbose == 1
  # Anchor with \A (whole string, not each line) and require "://" so that a
  # host merely beginning with "https" (e.g. "httpsomething.com") is not
  # mangled.
  i = "http" + i[5..-1] if i =~ %r{\Ahttps://}i
  puts "dopo reg exp: " + i.to_s if verbose == 1
  i
end
#basic normalization of url
# Runs +i+ through NormalizeUrl.process and returns the result.
#
# i - URL String.
#
# Returns whatever NormalizeUrl.process returns for +i+. The original version
# discarded that value and returned the input untouched, so no normalization
# ever took effect.
def norm(i)
  NormalizeUrl.process(i)
end
# Normalizes +input+ by running it through addHTTP, tryToFollow (at most 10
# requests), norm and removeSinHttp, in that order.
#
# input   - URL String, with or without a scheme.
# verbose - forwarded to tryToFollow and removeSinHttp.
#
# Returns the value produced by removeSinHttp, or the String "fail" when any
# of the rescued errors below occurs (a message is printed in that case).
def normalizetor(input, verbose)
  ris = addHTTP(input)
  ris = tryToFollow(ris, 10, verbose)
  ris = norm(ris)
  removeSinHttp(ris, verbose)
rescue SocketError, NameError
  puts "this seems to be not a real url, I'll return fail"
  "fail"
rescue ArgumentError, URI::InvalidURIError => e
  # tryToFollow signals an exhausted redirect limit with ArgumentError; treat
  # that, and unparsable URLs, as a failed normalization instead of letting
  # the error escape.
  puts "Invalid url or too many redirects: " + e.to_s
  "fail"
rescue Interrupt
  puts "you pressed CTRL + C"
  "fail"
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Errno::ENETUNREACH,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => e
  puts "Generic network error: " + e.to_s
  "fail"
end
# Demo run: prints one sample URL, normalizes it, and prints the result.
# Other sample inputs are kept below; swap the active assignment to try one.
#inp = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=windows+10+OEM"
#inp = "google.it"
#inp = "https://it.wikipedia.org/wiki/Tolosa"
#inp = "gatto"
#inp = "http://www.wired.it/internet/web/2015/05/05/15-pagine-errore-404/vcd"
#inp = "http://見.香港/"
#inp = "http://ar.wikipedia.org/wiki/ نجيب_محفوظ"
inp = "http://tinyurl.com/hdzfyrq"
printer("INIZIALE", inp)
printer("FINALE", normalizetor(inp, 0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment