Last active
June 1, 2016 14:07
-
-
Save dcode/056811a6c4a06ad76df6f732975b365a to your computer and use it in GitHub Desktop.
Takes in a CSV file of (rank, domain) rows (from Alexa, for instance) and browses the highest-ranking domains more often.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python2 | |
| from multiprocessing import Pool | |
| from time import sleep | |
| from random import randint, gauss | |
| import os, sys | |
| import requests | |
| ## TODO | |
| # . recursively download linked resources: | |
| # . images | |
| # . javascript | |
| # . css | |
| # . others? | |
| # . follow redirects | |
| # . meta-refresh | |
| # . HTTP 300 codes | |
| # . Vary user-agent | |
min_batch = 15   # Minimum time between batches is 15 secs
max_batch = 900  # Maximum time between batches is 15 mins
worker_sleep_min = 0  # Per-request jitter bounds (seconds) applied inside each worker
worker_sleep_max = 5
min_i = 1    # Valid bounds for the gaussian rank draw in mainloop()
max_i = 999  # Index of 1M -- presumably the input file has ~1000 rows; TODO confirm
mu = 500     # Pick the middle of 500 (mean of the gaussian rank draw)
sigma = 150  # Guessing that 68.3% of traffic will be in top 150
USER_AGENT = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'}
def web_request(url):
    """Fetch *url* once after a small random delay; discard the response.

    Runs inside a worker process. The caller never collects the async
    result, so nothing useful is returned.
    """
    # Random jitter so pooled workers don't all fire at the same instant.
    sleep(randint(worker_sleep_min, worker_sleep_max))
    try:
        # Bug fix: the original call had no timeout, so a stalled server
        # could hang this worker forever. The response is deliberately
        # discarded (the original bound it to an unused local).
        requests.get(url, headers=USER_AGENT, timeout=30)
    except requests.RequestException:
        # Best-effort traffic generation: a failed fetch is not fatal.
        pass
    return
| def mainloop(argv=[]): | |
| pool = Pool(processes=1) # start 6 worker processes | |
| if( len(argv) == 1): | |
| print "Usage: %s filename" % argv[0] | |
| sys.exit() | |
| filename = argv[1] | |
| alexa_sites = [] | |
| with open(filename,'ro') as f: | |
| alexa_sites = f.readlines() | |
| print "Read %d entries." % len(alexa_sites) | |
| while True: | |
| # queue up to 1000 requests | |
| res = None | |
| for x in range(1000): | |
| result = gauss(mu, sigma) | |
| while (min_i < result < max_i) == False: | |
| result = gauss(mu, sigma) | |
| # Get distance from mean | |
| offset = int(abs(mu - result)) | |
| url = "http://www.%s" % alexa_sites[offset].split(',')[1].strip() | |
| res = pool.apply_async(web_request, (url,)) | |
| print "Batch complete. Sleeping." | |
| sleep(randint(min_batch, max_batch)) | |
| pool.close() | |
| pool.join() | |
if __name__ == '__main__':
    # Entry point: pass the full argv so mainloop can report argv[0] in its usage message.
    mainloop(sys.argv)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python2 | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import sys | |
| from math import ceil | |
| from time import sleep | |
| from random import randint | |
# Page template: Alexa's per-country top-sites listing -- %d is the
# zero-based page number, %s the upper-cased country code.
BASE_URL='http://www.alexa.com/topsites/countries;%d/%s'
# Desktop Chrome user-agent so the site serves its normal HTML page.
USER_AGENT = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'}
if __name__ == '__main__':
    # Scrape Alexa's per-country top-sites pages and emit "rank,site" lines.
    if len(sys.argv) != 3:
        sys.stderr.write('Usage: COUNTRY-CODE TOP-N\n')
        sys.exit(1)

    cc = sys.argv[1].upper()
    top_n = int(sys.argv[2])
    delimiter = ','

    # Alexa lists 25 sites per page; round up so top_n entries are covered.
    total_pages = int(ceil(top_n / 25.0))
    for page in range(total_pages):
        page_html = requests.get(BASE_URL % (page, cc), headers=USER_AGENT).text
        listing = BeautifulSoup(page_html, 'lxml')
        for entry in listing.find_all('li', {'class': 'site-listing'}):
            rank = entry.find('div', {'class': "count"}).get_text()
            site = entry.find('div', {'class': 'desc-container'}).p.a.get_text().lower()
            print(delimiter.join((rank, site)))
        # Polite random pause between page fetches.
        sleep(randint(0, 5))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment