Skip to content

Instantly share code, notes, and snippets.

@hamletbatista
Created March 12, 2019 22:44
Show Gist options
  • Save hamletbatista/665187fcd3e772d67f915fd7b256f7e6 to your computer and use it in GitHub Desktop.
Save hamletbatista/665187fcd3e772d67f915fd7b256f7e6 to your computer and use it in GitHub Desktop.
#Handling URL parameters
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
def clean_url_params(url):
    """Return *url* with its query string normalized.

    Parameters that carry no value (for example ``sort=``) are dropped, the
    remaining parameters are ordered by name, and any fragment is removed,
    so URLs that differ only in those respects produce the same string.
    Each intermediate step is printed to stdout.

    Args:
        url: URL string to normalize.

    Returns:
        The rebuilt URL string.
    """
    print(url)
    parts = urlsplit(url)
    print(parts.query)
    # example output -> 'sizecode=99&sort='

    # parse_qsl skips blank values by default, which is what removes 'sort='.
    url_params = parse_qsl(parts.query)
    print(url_params)
    # example output -> [('sizecode', '99')]

    # Order by parameter name so equivalent URLs compare equal. The sort is
    # stable, so repeated names keep their original relative order.
    url_params.sort(key=lambda pair: pair[0])

    # Rebuild the URL from the normalized parameters.
    new_query = urlencode(url_params)
    print(new_query)
    # example output -> 'sizecode=99'

    # The empty last element discards any fragment.
    new_url = urlunsplit(
        (parts.scheme, parts.netloc, parts.path, new_query, "")
    )
    print(new_url)
    # example output -> 'http://www.example.com/brand/swirly/shopby?sizecode=99'
    return new_url
# Absolute source URLs that link to 404 pages, as reported by the
# Search Console API (webmasters.urlcrawlerrorssamples.list).
linkedFromUrls = [
    "http://www.example.com/brand/swirly/shopby?sizecode=99&sort=",
    "https://www.example.com/brand/swirly?sort=asc&sizecode=99",
    "https://www.example.com/brand/swirly?sizecode=99&sort=asc",
]

# Clean every sample URL in list order. Expected rewrites:
#
#   [0] a parameter with no value (sort=) is dropped:
#       http://www.example.com/brand/swirly/shopby?sizecode=99&sort=
#           -> http://www.example.com/brand/swirly/shopby?sizecode=99
#
#   [1] and [2] hold the same parameters in a different order and end up
#   identical:
#       https://www.example.com/brand/swirly?sort=asc&sizecode=99
#           -> https://www.example.com/brand/swirly?sizecode=99&sort=asc
#       https://www.example.com/brand/swirly?sizecode=99&sort=asc
#           -> https://www.example.com/brand/swirly?sizecode=99&sort=asc
for source_url in linkedFromUrls:
    clean_url_params(source_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment