Last active
October 11, 2017 14:00
-
-
Save clopez/dcae6ba605e7fa521b5e989ca323f522 to your computer and use it in GitHub Desktop.
Check Alexa top N sites to see how many support a specific Content-Encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Author: Carlos Alberto Lopez Perez <[email protected]>
# License: MIT
#
# Check Alexa top N sites to see how many support a specific Content-Encoding
#
# Examples of use:
# * Check top 100 alexa for encoding gzip:
# $ ./alexa_check_content_encoding.py gzip 100
# * Check top 500 alexa for encoding brotli:
# $ ./alexa_check_content_encoding.py br 500
# * Check top 1000 alexa for encoding Zstandard:
# $ ./alexa_check_content_encoding.py zstd 1000
#
import argparse | |
import io | |
import zipfile | |
import urllib.request, urllib.parse, urllib.error | |
def alexa_top_list(num=1000, url='http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', timeout=30):
    """Return the first *num* domains of the Alexa top-1M ranking.

    The zipped CSV at *url* is downloaded, its ``top-1m.csv`` member is
    decoded as UTF-8 and read line by line; each line is expected to be
    ``rank,domain`` with ranks starting at 1 and increasing by one.

    Args:
        num: How many domains to return. Values <= 0 yield an empty list.
        url: Location of the zip archive (any scheme urllib can open).
        timeout: Socket timeout in seconds for the download.

    Returns:
        A list of domain names in rank order. It is shorter than *num*
        when the data file holds fewer entries.

    Raises:
        ValueError: If a line is not ``rank,domain`` or a rank does not
            match its line position (data file not sorted).
    """
    if num <= 0:
        # Without this guard the "stop at num" test below never fires and
        # the entire one-million-entry list would be returned.
        return []

    # Context managers make sure the connection and the archive are closed
    # even when reading or parsing fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        archive_bytes = response.read()
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zfile:
        csv_text = zfile.read('top-1m.csv').decode('utf-8')

    alexa_list = []
    for expected_rank, line in enumerate(io.StringIO(csv_text), start=1):
        rank, domain = line.split(',')
        if int(rank.strip()) != expected_rank:
            raise ValueError("Something is wrong with the alexa data file. Maybe not sorted?")
        alexa_list.append(domain.strip())
        if expected_rank == num:
            break
    return alexa_list
def server_supports_encoding(encoding, url):
    """Tell whether the server at *url* answers using content coding *encoding*.

    A GET request is sent with a browser-like User-Agent and with
    ``Accept-Encoding`` set to *encoding*; the ``Content-Encoding`` header(s)
    of the response are then inspected.

    Args:
        encoding: Content-coding token to look for (e.g. ``"gzip"``, ``"br"``).
        url: Full URL to request.

    Returns:
        True if *encoding* is one of the codings listed in the response's
        ``Content-Encoding`` header(s), False otherwise.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (connection errors,
        timeouts, HTTP error statuses); these are left to the caller.
    """
    req_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        'Accept-Encoding': encoding,
    }
    request = urllib.request.Request(url, headers=req_headers)
    # Only the headers are needed; the context manager closes the connection
    # without reading the body.
    with urllib.request.urlopen(request, timeout=5) as response:
        header_values = response.headers.get_all('Content-Encoding') or []

    # Content-Encoding is a comma-separated, case-insensitive list of codings.
    # Compare whole tokens so that e.g. "gzip" does not match "x-gzip".
    wanted = encoding.strip().lower()
    for value in header_values:
        for coding in value.split(','):
            if coding.strip().lower() == wanted:
                return True
    return False
if __name__ == "__main__":
    # Command line: <encoding> <number of Alexa top sites to check>.
    parser = argparse.ArgumentParser()
    parser.add_argument("encoding", type=str, help="Name of the codename of the encoding to test")
    parser.add_argument("alexanumber", type=int, help="Number of alexa top n sites to check")
    args = parser.parse_args()
    encoding = args.encoding.strip()
    alexa_number_domains = args.alexanumber  # argparse already converted it to int
    if alexa_number_domains < 1:
        parser.error("alexanumber must be a positive integer")

    # Only a warning: unknown names are still queried, in case the list is outdated.
    iana_encoding_list = ["aes128gcm", "br", "compress", "deflate", "exi", "gzip", "identity", "pack200-gzip", "x-compress", "x-gzip", "zstd"]
    if encoding not in iana_encoding_list:
        print ("\033[0;33mWARNING:\033[0m encoding %s is not on the IANA encoding list" %encoding)
        print ("Check: https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding")
        print ("And check you spelled it correcly (example: is \"br\" instead of \"brotli\")\n")

    print("Querying alexa top %d domains for encoding %s ..." %(alexa_number_domains,encoding))
    domains_toquery = alexa_top_list(alexa_number_domains)
    domains_support_encoding = 0
    domains_notsupport_encoding = 0
    domains_error = 0
    for domain in domains_toquery:
        supported = False
        answered = False
        # Check both http and https. Each scheme is tried on its own so that a
        # failure on plain http does not prevent the https check.
        for scheme in ("http", "https"):
            try:
                supported = server_supports_encoding(encoding, scheme + "://" + domain + "/")
            except Exception:
                # KeyboardInterrupt/SystemExit are not Exception subclasses,
                # so Ctrl-C still aborts the run.
                continue
            answered = True
            if supported:
                break
        if supported:
            print("\033[0;32mDomain %s supports encoding %s\033[0m" %(domain,encoding))
            domains_support_encoding += 1
        elif answered:
            print("\033[0;31mDomain %s NOT supports encoding %s\033[0m" %(domain,encoding))
            domains_notsupport_encoding += 1
        else:
            # Neither scheme produced a response.
            print("Domain %s gave an error, likely timeout" %domain)
            domains_error += 1

    print("---- SUMMARY ----")
    if domains_error > 0:
        print("%d servers queried sucesfully (%d caused error)" %(len(domains_toquery) - domains_error, domains_error))
    else:
        print("%d servers queried sucesfully" %len(domains_toquery))
    print("%d servers support encoding %s" %(domains_support_encoding, encoding))
    print("%d servers NOT support encoding %s" %(domains_notsupport_encoding, encoding))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment