@dcode
Last active June 1, 2016 14:07
Takes a CSV of (rank,domain) pairs (from Alexa, for instance) and browses the highest-ranking domains more often. The second script below scrapes Alexa's per-country top-sites pages to produce such a list.
#!/usr/bin/env python2
from multiprocessing import Pool
from time import sleep
from random import randint, gauss
import sys
import requests
## TODO
#  - recursively download linked resources:
#      - images
#      - javascript
#      - css
#      - others?
#  - follow redirects:
#      - meta-refresh
#      - HTTP 300 codes
#  - vary user-agent
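#
# One possible approach to the "vary user-agent" item (a sketch, untested;
# the USER_AGENTS list is hypothetical, not part of the gist):
#
#   import random
#   USER_AGENTS = [
#       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) ...',
#       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
#   ]
#   headers = {'user-agent': random.choice(USER_AGENTS)}
#
# Note: requests.get() already follows HTTP 3xx redirects by default
# (allow_redirects=True), so of the redirect items only meta-refresh
# needs new parsing code.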
min_batch = 15          # Minimum time between batches is 15 secs
max_batch = 900         # Maximum time between batches is 15 mins
worker_sleep_min = 0
worker_sleep_max = 5
min_i = 1
max_i = 999             # Upper bound on the sampled rank (top 1000)
mu = 500                # Center the distribution on the middle of the range
sigma = 150             # Guessing that 68.3% of traffic will be in the top 150
USER_AGENT = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'}
def web_request(url):
    # Stagger workers so requests don't all fire at once
    sleep(randint(worker_sleep_min, worker_sleep_max))
    # Fetch and discard the response; generating the traffic is the point
    requests.get(url, headers=USER_AGENT)
    return
def mainloop(argv=[]):
    pool = Pool(processes=1)    # one worker process; raise for more concurrency
    if len(argv) < 2:
        print "Usage: %s filename" % argv[0]
        sys.exit()

    filename = argv[1]
    alexa_sites = []
    with open(filename, 'r') as f:
        alexa_sites = f.readlines()
    print "Read %d entries." % len(alexa_sites)

    while True:
        # Queue up to 1000 requests per batch
        for x in range(1000):
            # Rejection-sample a Gaussian until it lands inside (min_i, max_i)
            result = gauss(mu, sigma)
            while not (min_i < result < max_i):
                result = gauss(mu, sigma)
            # Fold the distance from the mean into a list index, so the
            # highest-ranking entries are browsed most often
            offset = int(abs(mu - result))
            url = "http://www.%s" % alexa_sites[offset].split(',')[1].strip()
            pool.apply_async(web_request, (url,))
        print "Batch complete. Sleeping."
        sleep(randint(min_batch, max_batch))

    # Unreachable while the batch loop runs forever; kept for cleanup symmetry
    pool.close()
    pool.join()

if __name__ == '__main__':
    mainloop(sys.argv)
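A quick sanity check (not part of the original gist): the snippet below replays the same rejection-sampling logic as mainloop() and buckets 10,000 sampled offsets, showing that draws concentrate on the top-ranked entries.

#!/usr/bin/env python2
from random import gauss
from collections import Counter

mu, sigma, min_i, max_i = 500, 150, 1, 999

def sample_offset():
    # Same scheme as mainloop(): draw until inside (min_i, max_i),
    # then fold the distance from the mean into a list index
    result = gauss(mu, sigma)
    while not (min_i < result < max_i):
        result = gauss(mu, sigma)
    return int(abs(mu - result))

counts = Counter(sample_offset() // 100 for _ in xrange(10000))
for bucket in sorted(counts):
    print "indices %3d-%3d: %5d draws" % (bucket * 100, bucket * 100 + 99, counts[bucket])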
#!/usr/bin/env python2
from bs4 import BeautifulSoup
import requests
import sys
from math import ceil
from time import sleep
from random import randint
BASE_URL='http://www.alexa.com/topsites/countries;%d/%s'
USER_AGENT = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'}
if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.stderr.write('Usage: COUNTRY-CODE TOP-N\n')
        sys.exit(1)

    country_code = sys.argv[1].upper()
    number = int(sys.argv[2])
    delimiter = ','

    # Alexa lists 25 sites per page
    page_numbers = int(ceil(number / 25.0))
    for page_num in range(page_numbers):
        response = requests.get(BASE_URL % (page_num, country_code), headers=USER_AGENT)
        soup = BeautifulSoup(response.text, 'lxml')
        bullets = soup.find_all('li', {'class': 'site-listing'})
        for bullet in bullets:
            rank = bullet.find('div', {'class': 'count'}).get_text()
            site = bullet.find('div', {'class': 'desc-container'}).p.a.get_text().lower()
            print('%s%s%s' % (rank, delimiter, site))
        # Be polite between page fetches
        sleep(randint(0, 5))
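Example of chaining the two scripts (the filenames here are hypothetical; the gist does not name its files):

    python2 get_alexa_top.py US 500 > alexa_top.csv
    python2 browse_alexa.py alexa_top.csv

The scraper writes rank,domain lines to stdout, which is exactly the (rank,domain) CSV the browsing script indexes into; with mu = 500, TOP-N should be at least 500 so every sampled offset has a matching line.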