-
-
Save ProGM/a704b5e187ece157d73d to your computer and use it in GitHub Desktop.
Normalizer for URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'normalize_url' | |
require 'rubygems' | |
require 'httpclient' | |
require 'net/http' | |
require 'uri' | |
# Prints a labelled value on a single line, formatted as "<text>: <value>".
#
# @param text [#to_s] label printed before the colon
# @param value [#to_s] value printed after the colon
# @return [nil]
def printer(text, value)
  # Interpolation calls #to_s on both parts, so a non-String label
  # (Symbol, nil, ...) is printed instead of raising.
  puts "#{text}: #{value}"
end
# Appends a trailing "/" to the textual form of +u+ and parses the result.
#
# @param u [#to_s] URL (String or URI) to extend
# @return [URI::Generic] parsed URI of the extended text
# @raise [URI::InvalidURIError] if the extended text is not a valid URI
def addSlash(u)
  URI.parse("#{u}/")
end
# Builds a GET request for the path of +u+ with a browser-like User-Agent.
#
# Only +u.path+ is used; the query string of +u+ is not part of the request.
#
# @param u [URI::Generic] parsed URL whose path is requested
# @return [Net::HTTP::Get] the prepared (unsent) request
def getPathFrom(u)
  headers = { 'User-Agent' => 'Mozilla/5.0 (etc...)' }
  path = u.path.to_s
  if path.empty?
    # Net::HTTP::Get.new raises ArgumentError for an empty path. Substitute
    # "/" directly instead of appending a slash to the whole URL text, which
    # left the path empty whenever the URL carried a query string.
    puts "argument error, I'll add the final slash"
    path = "/"
  end
  Net::HTTP::Get.new(path, headers)
end
# Performs an HTTP GET on +u+ and returns the response object.
#
# Failures to parse or resolve the URL are reported on stdout and turned
# into a nil return value instead of an exception.
#
# @param u [String, URI] URL to fetch
# @return [Net::HTTPResponse, nil] the response, or nil when the URL could
#   not be parsed or the request could not be made
def getResponse(u)
  Net::HTTP.get_response(URI(u))
rescue URI::InvalidURIError, ArgumentError
  # The constant must be qualified: a bare InvalidURIError is undefined and
  # turned every failure into a NameError. ArgumentError is what URI() raises
  # for input that is neither a String nor a URI (e.g. nil).
  puts "character error"
  nil
rescue SocketError, NameError
  puts "I believe it's not a real url"
  nil
end
#redirect
# Follows HTTP redirects starting at +uri_str+ and returns the final URL.
#
# NOTE: +limit+ is an optional parameter sitting between two required ones,
# so a two-argument call binds (uri_str, verbose) and leaves limit at 10.
# Pass all three arguments whenever a specific limit is intended.
#
# @param uri_str [#to_s] URL to start from
# @param limit [Integer] number of requests still allowed
# @param verbose [Integer] verbosity flag; only forwarded to recursive calls
# @return [String] URL of the first response that is not a redirect
# @raise [ArgumentError] 'too many HTTP redirects' once limit reaches 0
# @raise [SocketError] when getResponse yields nil (URL unusable/unreachable)
# @raise [Net::ProtocolError] via Net::HTTPResponse#value for any response
#   that is neither a success nor a followable redirect
def tryToFollow(uri_str, limit = 10, verbose)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP redirects' if limit == 0

  current = uri_str.to_s
  response = getResponse(current)
  # A nil response would otherwise fail with NoMethodError further down.
  raise SocketError, "no response from #{current}" if response.nil?

  case response
  when Net::HTTPSuccess
    current
  when Net::HTTPRedirection
    location = response['location']
    # A redirect without a Location header cannot be followed; #value raises.
    response.value if location.nil?
    target = begin
      # Location may be relative; resolve it against the URL just requested.
      URI.join(current, location).to_s
    rescue URI::Error
      location
    end
    # Pass all three arguments so limit really decreases, and return the
    # result of the recursion so the end of the chain is reported.
    tryToFollow(target, limit - 1, verbose)
  else
    response.value
    current
  end
end
# Prefixes +input+ with "http://" unless it already starts with an
# http:// or https:// scheme (compared case-insensitively).
#
# @param input [String] URL or bare host name
# @return [String] +input+ itself when it already has a scheme, otherwise
#   a new String with "http://" prepended
def addHTTP(input)
  # Require the full "scheme://" prefix: testing only the first letters
  # mistook hosts such as "httpbin.org" for URLs that already had a scheme.
  return input if input.match?(%r{\Ahttps?://}i)

  "http://#{input}"
end
#remove the s in https
# Rewrites a leading "https:" scheme to "http:"; other input is returned
# with unchanged content.
#
# @param i [String] URL to rewrite
# @param verbose [Integer] when 1, prints the value before and after
# @return [String] the URL with an http scheme
def removeSinHttp(i, verbose)
  puts "prima di reg exp: " + i.to_s if verbose == 1
  # \A anchors at the start of the string (^ matches after any newline), the
  # colon keeps hosts like "httpsfoo.com" intact, and /i covers "HTTPS:".
  i = i.sub(/\Ahttps:/i, "http:")
  puts "dopo reg exp: " + i.to_s if verbose == 1
  i
end
#basic normalization of url
# Runs +i+ through NormalizeUrl.process and returns what it produces.
#
# @param i [String] URL to normalize
# @return [Object] the value returned by NormalizeUrl.process
def norm(i)
  # The processed value must be the return value; returning +i+ afterwards
  # discarded the normalization.
  NormalizeUrl.process(i)
end
# Normalizes +input+ by chaining addHTTP, tryToFollow, norm and
# removeSinHttp, in that order.
#
# Errors raised along the way are reported on stdout and mapped to the
# sentinel string "fail" instead of propagating.
#
# @param input [String] URL or bare host name
# @param verbose [Integer] verbosity flag forwarded to the helpers
# @return [String] the normalized URL, or "fail" when any step failed
def normalizetor(input, verbose)
  ris = addHTTP(input)
  ris = tryToFollow(ris, 10, verbose)
  ris = norm(ris)
  removeSinHttp(ris, verbose)
rescue SocketError, NameError
  puts "this seems to be not a real url, I'll return fail"
  "fail"
rescue Interrupt
  puts "you pressed CTRL + C"
  "fail"
rescue Timeout::Error, SystemCallError, EOFError, ArgumentError, URI::InvalidURIError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => e
  # SystemCallError covers every Errno::* (ECONNREFUSED, EHOSTUNREACH, ...);
  # ArgumentError is what tryToFollow raises for 'too many HTTP redirects'.
  puts "Generic network error: " + e.to_s
  "fail"
end
# Demo run: normalizes one sample URL and prints it before and after.
# Uncomment one of the alternative inputs below to try a different case.
#inp = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=windows+10+OEM"
inp = "http://tinyurl.com/hdzfyrq"
#inp = "google.it"
#inp = "https://it.wikipedia.org/wiki/Tolosa"
#inp = "gatto"
#inp = "http://www.wired.it/internet/web/2015/05/05/15-pagine-errore-404/vcd"
#inp = "http://見.香港/"
#inp = "http://ar.wikipedia.org/wiki/ نجيب_محفوظ"
printer("INIZIALE", inp)
r = normalizetor(inp, 0)
printer("FINALE", r)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'normalize_url' | |
require 'rubygems' | |
require 'httpclient' | |
require 'net/http' | |
require 'uri' | |
# Prints a labelled value on a single line, formatted as "<text>: <value>".
#
# @param text [#to_s] label printed before the colon
# @param value [#to_s] value printed after the colon
# @return [nil]
def printer(text, value)
  # Interpolation calls #to_s on both parts, so a non-String label
  # (Symbol, nil, ...) is printed instead of raising.
  puts "#{text}: #{value}"
end
# Appends a trailing "/" to the textual form of +u+ and parses the result.
#
# @param u [#to_s] URL (String or URI) to extend
# @return [URI::Generic] parsed URI of the extended text
# @raise [URI::InvalidURIError] if the extended text is not a valid URI
def addSlash(u)
  URI.parse("#{u}/")
end
# Builds a GET request for the path of +u+ with a browser-like User-Agent.
#
# Only +u.path+ is used; the query string of +u+ is not part of the request.
#
# @param u [URI::Generic] parsed URL whose path is requested
# @return [Net::HTTP::Get] the prepared (unsent) request
def getPathFrom(u)
  headers = { 'User-Agent' => 'Mozilla/5.0 (etc...)' }
  path = u.path.to_s
  if path.empty?
    # Net::HTTP::Get.new raises ArgumentError for an empty path. Substitute
    # "/" directly instead of appending a slash to the whole URL text, which
    # left the path empty whenever the URL carried a query string.
    puts "argument error, I'll add the final slash"
    path = "/"
  end
  Net::HTTP::Get.new(path, headers)
end
# Performs an HTTP GET on +u+ and returns the response object.
#
# Failures to parse or resolve the URL are reported on stdout and turned
# into a nil return value instead of an exception.
#
# @param u [String, URI] URL to fetch
# @return [Net::HTTPResponse, nil] the response, or nil when the URL could
#   not be parsed or the request could not be made
def getResponse(u)
  Net::HTTP.get_response(URI(u))
rescue URI::InvalidURIError, ArgumentError
  # The constant must be qualified: a bare InvalidURIError is undefined and
  # turned every failure into a NameError. ArgumentError is what URI() raises
  # for input that is neither a String nor a URI (e.g. nil).
  puts "character error"
  nil
rescue SocketError, NameError
  puts "I believe it's not a real url"
  nil
end
#redirect
# Follows HTTP redirects starting at +uri_str+ and returns the final URL.
#
# NOTE: +limit+ is an optional parameter sitting between two required ones,
# so a two-argument call binds (uri_str, verbose) and leaves limit at 10.
# Pass all three arguments whenever a specific limit is intended.
#
# @param uri_str [#to_s] URL to start from
# @param limit [Integer] number of requests still allowed
# @param verbose [Integer] verbosity flag; only forwarded to recursive calls
# @return [String] URL of the first response that is not a redirect
# @raise [ArgumentError] 'too many HTTP redirects' once limit reaches 0
# @raise [SocketError] when getResponse yields nil (URL unusable/unreachable)
# @raise [Net::ProtocolError] via Net::HTTPResponse#value for any response
#   that is neither a success nor a followable redirect
def tryToFollow(uri_str, limit = 10, verbose)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP redirects' if limit == 0

  current = uri_str.to_s
  response = getResponse(current)
  # A nil response would otherwise fail with NoMethodError further down.
  raise SocketError, "no response from #{current}" if response.nil?

  case response
  when Net::HTTPSuccess
    current
  when Net::HTTPRedirection
    location = response['location']
    # A redirect without a Location header cannot be followed; #value raises.
    response.value if location.nil?
    target = begin
      # Location may be relative; resolve it against the URL just requested.
      URI.join(current, location).to_s
    rescue URI::Error
      location
    end
    # Pass all three arguments so limit really decreases, and return the
    # result of the recursion so the end of the chain is reported.
    tryToFollow(target, limit - 1, verbose)
  else
    response.value
    current
  end
end
# Prefixes +input+ with "http://" unless it already starts with an
# http:// or https:// scheme (compared case-insensitively).
#
# @param input [String] URL or bare host name
# @return [String] +input+ itself when it already has a scheme, otherwise
#   a new String with "http://" prepended
def addHTTP(input)
  # Require the full "scheme://" prefix: testing only the first letters
  # mistook hosts such as "httpbin.org" for URLs that already had a scheme.
  return input if input.match?(%r{\Ahttps?://}i)

  "http://#{input}"
end
#remove the s in https
# Rewrites a leading "https:" scheme to "http:"; other input is returned
# with unchanged content.
#
# @param i [String] URL to rewrite
# @param verbose [Integer] when 1, prints the value before and after
# @return [String] the URL with an http scheme
def removeSinHttp(i, verbose)
  puts "prima di reg exp: " + i.to_s if verbose == 1
  # \A anchors at the start of the string (^ matches after any newline), the
  # colon keeps hosts like "httpsfoo.com" intact, and /i covers "HTTPS:".
  i = i.sub(/\Ahttps:/i, "http:")
  puts "dopo reg exp: " + i.to_s if verbose == 1
  i
end
#basic normalization of url
# Runs +i+ through NormalizeUrl.process and returns what it produces.
#
# @param i [String] URL to normalize
# @return [Object] the value returned by NormalizeUrl.process
def norm(i)
  # The processed value must be the return value; returning +i+ afterwards
  # discarded the normalization.
  NormalizeUrl.process(i)
end
# Normalizes +input+ by chaining addHTTP, tryToFollow, norm and
# removeSinHttp, in that order.
#
# Errors raised along the way are reported on stdout and mapped to the
# sentinel string "fail" instead of propagating.
#
# @param input [String] URL or bare host name
# @param verbose [Integer] verbosity flag forwarded to the helpers
# @return [String] the normalized URL, or "fail" when any step failed
def normalizetor(input, verbose)
  ris = addHTTP(input)
  ris = tryToFollow(ris, 10, verbose)
  ris = norm(ris)
  removeSinHttp(ris, verbose)
rescue SocketError, NameError
  puts "this seems to be not a real url, I'll return fail"
  "fail"
rescue Interrupt
  puts "you pressed CTRL + C"
  "fail"
rescue Timeout::Error, SystemCallError, EOFError, ArgumentError, URI::InvalidURIError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => e
  # SystemCallError covers every Errno::* (ECONNREFUSED, EHOSTUNREACH, ...);
  # ArgumentError is what tryToFollow raises for 'too many HTTP redirects'.
  puts "Generic network error: " + e.to_s
  "fail"
end
# Demo run: normalizes one sample URL and prints it before and after.
# Uncomment one of the alternative inputs below to try a different case.
#inp = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=windows+10+OEM"
inp = "http://tinyurl.com/hdzfyrq"
#inp = "google.it"
#inp = "https://it.wikipedia.org/wiki/Tolosa"
#inp = "gatto"
#inp = "http://www.wired.it/internet/web/2015/05/05/15-pagine-errore-404/vcd"
#inp = "http://見.香港/"
#inp = "http://ar.wikipedia.org/wiki/ نجيب_محفوظ"
printer("INIZIALE", inp)
r = normalizetor(inp, 0)
printer("FINALE", r)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment