Created
September 2, 2019 01:20
-
-
Save jszym/bec209bb22541b849457df375472d34a to your computer and use it in GitHub Desktop.
A quick script that strips Google (UTM) tracking parameters, as well as the tracking query strings used on NYTimes URLs, from a URL.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import parse_qs, urlparse, urlencode, urlunparse | |
import copy | |
def clean_trackers_url(url, *, banned_keys=("emc", "partner"), banned_prefixes=("utm_",)):
    """Return *url* with tracking query parameters removed.

    A query parameter is dropped when its name is exactly one of
    *banned_keys* or starts with one of *banned_prefixes*. Every other
    parameter is kept, including repeated parameters (``a=1&a=2``) and
    parameters with an empty value (``q=``). The scheme, host, path,
    params and fragment are passed through unchanged.

    Args:
        url: The URL string to clean.
        banned_keys: Query parameter names to remove (exact matches).
        banned_prefixes: Query parameter name prefixes to remove.

    Returns:
        The URL re-assembled with the remaining query parameters, which
        are re-encoded by ``urlencode``.
    """
    url_obj = urlparse(url)

    exact_matches = frozenset(banned_keys)
    # str.startswith accepts a tuple, so one call checks every prefix.
    prefixes = tuple(banned_prefixes)

    # keep_blank_values=True so parameters such as "q=" are not silently lost.
    raw_query = parse_qs(url_obj.query, keep_blank_values=True)

    # Build the filtered mapping in one pass; a key that matches several
    # rules is simply skipped once instead of being deleted twice.
    clean_query = {
        key: values
        for key, values in raw_query.items()
        if key not in exact_matches and not key.startswith(prefixes)
    }

    # doseq=True emits every value of a repeated parameter, not just the first.
    assembled_query = urlencode(clean_query, doseq=True)
    return urlunparse(url_obj._replace(query=assembled_query))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment