Created
February 17, 2014 17:27
-
-
Save jjam3774/d37d2467e2e3749c6208 to your computer and use it in GitHub Desktop.
load with akamai
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# <nbformat>3.0</nbformat> | |
# <codecell> | |
from httplib import HTTPConnection | |
import time | |
import re | |
import random | |
# <codecell> | |
GOOGLEBOT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" | |
CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36" | |
# <codecell> | |
def convert_to_akamai_url(url):
    '''
    Build the CDN "fetch_thematic" URL that corresponds to an origin page URL.

    The scheme (if present) and the host are dropped from ``url``; what
    follows the first '/' after the host is appended to the fixed CDN
    prefix, with every '-th.html' rewritten to '.html'.

    url -- origin page URL, with or without a leading scheme.

    Returns the CDN URL as a string. If ``url`` has no path component the
    bare prefix is returned.
    '''
    akamai_prefix = 'http://thematic-id-5017.brcdn.com/fetch_thematic/5017/gmtnjthiky37113i/v1/'

    # Only treat '://' as a scheme separator when it precedes every other
    # '/', so a URL embedded in the query string is left alone.
    scheme_end = url.find('://')
    if scheme_end != -1 and url.find('/') == scheme_end + 1:
        url = url[scheme_end + 3:]

    # Everything after the first '/' is the path (plus query); the host
    # itself is not needed. This replaces a fixed 28-character offset that
    # was only right for one specific scheme + host.
    _host, _sep, path = url.partition('/')
    return akamai_prefix + path.replace('-th.html', '.html')
# <codecell> | |
def test_url(url, match_url, user_agent = GOOGLEBOT_USER_AGENT):
    '''
    Tests whether the thematic page can be fetched and declares the expected
    canonical link.

    url        -- address to request; a leading 'http://' is optional.
    match_url  -- URL that must appear as the href of a rel=canonical
                  link in the response body.
    user_agent -- value sent in the 'user-agent' request header.

    Returns a triple (success, reason, time_used):
      * (True, '', seconds) when the canonical link is found;
      * (False, 'Response code ...', seconds) when the page was fetched
        but the canonical link is missing;
      * (False, message, -1) when the URL has no path or any exception
        occurred while fetching/inspecting the page.
    time_used covers sending the request and receiving the response
    headers, not reading the body. No exception escapes this function.
    '''
    # Remove only the leading scheme; splitting on 'http://' would also cut
    # the URL at any later occurrence (e.g. inside a query string).
    if url.startswith('http://'):
        url = url[len('http://'):]
    path_start = url.find('/')
    if path_start == -1:
        return (False, 'Not a valid URL.', -1)
    host, path = url[:path_start], url[path_start:]

    conn = None
    try:
        started = time.time()
        conn = HTTPConnection(host)
        conn.request('GET', path, headers={'user-agent': user_agent})
        res = conn.getresponse()
        time_used = time.time() - started
        msg = res.read()
        # Some interpreters return the body as bytes; the pattern below is
        # text, so decode when needed (no-op when read() already gives str).
        if not isinstance(msg, str):
            msg = msg.decode('utf-8', 'replace')
        pattern = r'rel=["]?canonical["]?\s+href=["]?%s["]?' % re.escape(match_url)
        if not re.search(pattern, msg):
            return (False, 'Response code %s for url: %s' % (str(res.status), match_url), time_used)
        return (True, '', time_used)
    except Exception as e:
        # Best-effort probe: report the failure to the caller instead of
        # aborting the whole run.
        return (False, str(e), -1)
    finally:
        # Release the socket on every path, including failures.
        if conn is not None:
            conn.close()
# <codecell> | |
# Input list: one page URL per line.
filename = 'neimanmarcus_urls.txt'
with open(filename) as f:
    # Strip surrounding whitespace (including '\r' from CRLF files) and skip
    # blank lines, so the file's trailing newline does not yield an empty
    # "URL" that would later be probed and reported as invalid.
    urls = [line.strip() for line in f if line.strip()]
# <codecell> | |
# Probe the pages in random order. Each URL is requested twice with the
# Chrome user agent: once directly and once through its CDN counterpart;
# both responses are checked for a canonical link to the original URL.
random.shuffle(urls)
for url in urls:
    res = test_url(url, url, user_agent=CHROME_USER_AGENT)
    print('%s %s' % (url, res))

    akamai_url = convert_to_akamai_url(url)
    res_akamai = test_url(akamai_url, url, user_agent=CHROME_USER_AGENT)
    print('%s %s' % (akamai_url, res_akamai))

    # Pause for one second between URL pairs.
    time.sleep(1)
# <codecell> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment