Created
February 17, 2014 17:26
-
-
Save jjam3774/a1ab6be8324f8596607b to your computer and use it in GitHub Desktop.
Test Load a page...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
from httplib import HTTPConnection | |
import time | |
import re | |
import random | |
# <codecell> | |
# User-Agent header values; the crawler string is the default used by test_url.
GOOGLEBOT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36"
# <codecell>
def test_url(url, user_agent=GOOGLEBOT_USER_AGENT):
    '''
    Fetch ``url`` over plain HTTP and check that the response body contains
    a ``rel=canonical`` link whose ``href`` is ``url`` itself.

    url        -- address to fetch, with or without a leading 'http://';
                  it must contain a '/' after the host part.
    user_agent -- value sent in the 'user-agent' request header.

    Returns a triple (success, reason, time_used):
      * (True, '', seconds) when the canonical link is found;
      * (False, 'Response code ...', seconds) when the page was fetched but
        the canonical link is missing;
      * (False, message, -1) when the URL has no '/' after the host or
        anything raised while fetching/checking the page.
    '''
    original_url = url
    scheme = 'http://'
    if url.startswith(scheme):
        # Strip only the leading scheme; splitting on it would truncate URLs
        # that contain 'http://' again later (e.g. inside a query string).
        url = url[len(scheme):]
    i = url.find('/')
    if i == -1:
        return (False, 'Not a valid URL.', -1)
    try:
        started = time.time()
        conn = HTTPConnection(url[:i])
        try:
            conn.request('GET', url[i:], headers={'user-agent': user_agent})
            res = conn.getresponse()
            msg = res.read()
        finally:
            # Release the socket even when the request or the read fails.
            conn.close()
        time_used = time.time() - started
        pattern = r'rel=["]?canonical["]?\s+href=["]?%s["]?' % re.escape(original_url)
        if not re.search(pattern, msg):
            return (False, 'Response code %s for url: %s' % (str(res.status), original_url), time_used)
        return (True, '', time_used)
    except Exception as e:
        # Best-effort probe: report any failure as a result instead of raising.
        return (False, str(e), -1)


# Despite its name this is a helper, not a unit test; keep pytest/nose from
# collecting it when this module is imported into a test module.
test_url.__test__ = False
# <codecell> | |
# Input file: one URL per line.
filename = 'neimanmarcus_urls.txt'
with open(filename) as f:
    # Strip surrounding whitespace (including a stray '\r') and drop blank
    # lines, so an empty entry does not cost a pointless check and a sleep.
    urls = [line.strip() for line in f if line.strip()]
# <codecell>
# Probe the URLs in random order, pausing one second after each request.
random.shuffle(urls)
for url in urls:
    res = test_url(url, user_agent=CHROME_USER_AGENT)
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print('%s %s' % (url, res))
    time.sleep(1)
# <codecell> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment