@evilpie
Last active August 10, 2016 13:29
# Scrape the <script> tags of the 10,000 highest-ranked sites in top-1m.csv,
# saving external and inline scripts under d/<site>/ while reporting progress
# in a curses window. Ten worker threads each handle one slice of the list.
# Python 2 (uses the urlparse module and xrange).
import csv
import curses
import os
import re
import threading
import urlparse
import uuid

import requests
from bs4 import BeautifulSoup
# top-1m.csv has "rank,domain" rows; keep the first 10,000 domains.
sites = []
count = 0
with open("top-1m.csv") as f:
    for row in csv.reader(f):
        sites.append(row[1])
        count += 1
        if count == 10000:
            break
# Keep only a small whitelist of characters so status messages render cleanly
# in the curses window.
non_printable = re.compile(r"[^a-zA-Z/:\-. ]")

def filter_text(message):
    return non_printable.sub("", message)
def scrap(site, window, num):
    """Fetch one site and save every external and inline <script> it serves."""

    def text(message):
        # Show a status message in this thread's row of the curses window.
        length = 100
        window.hline(num, 35, " ", length)
        window.addnstr(num, 35, filter_text(message), length)
        window.refresh()

    text("downloading")
    page = requests.get("http://" + site, timeout=2)

    text("parsing")
    soup = BeautifulSoup(page.content, "html.parser")

    directory = "d/" + site
    os.mkdir(directory)

    for script in soup.find_all("script"):
        src = script.get("src")
        if src:
            # External script: resolve the src against the final page URL
            # (the page may have redirected), then download and save it.
            src = urlparse.urljoin(page.url, src)
            text("loading " + src)
            try:
                r = requests.get(src, timeout=2)
                with open(directory + "/" + str(uuid.uuid4()) + "src", "wb") as out:
                    out.write(r.text.encode("utf-8"))
            except Exception as e:
                text(str(e))
        else:
            # Inline script: save its text directly.
            text("saving inline")
            with open(directory + "/" + str(uuid.uuid4()) + "inline", "wb") as out:
                out.write(script.text.encode("utf-8"))
class Scrapper(threading.Thread):
    """Worker thread that scrapes its own slice of sites, reporting on row num."""

    def __init__(self, sites, num, window):
        threading.Thread.__init__(self)
        self.sites = sites
        self.num = num
        self.window = window

    def run(self):
        for (index, site) in enumerate(self.sites):
            self.window.hline(self.num, 0, " ", 70)
            self.window.addnstr(self.num, 0, "{num}/{index:4}: {site}".format(num=self.num, index=index, site=site), 30)
            self.window.refresh()
            try:
                scrap(site, self.window, self.num)
            except Exception as e:
                # Any failure (timeout, mkdir collision, parse error) is shown
                # in the status column and the thread moves on to the next site.
                self.window.hline(self.num, 35, " ", 35)
                self.window.addnstr(self.num, 35, str(e), 35)
                self.window.refresh()
def partition(lst, n):
    # Split lst into n roughly equal, contiguous slices.
    division = len(lst) / float(n)
    return [lst[int(round(division * i)):int(round(division * (i + 1)))] for i in xrange(n)]
os.mkdir("d")

window = curses.initscr()
curses.noecho()
curses.curs_set(0)

# One worker thread (and one display row) per slice of the site list.
num = 0
for chunk in partition(sites, 10):
    thread = Scrapper(chunk, num, window)
    thread.start()
    num += 1

# Main thread: wait for 'q' (or Ctrl-C), then tear down curses and exit,
# taking the worker threads down with the process.
while True:
    try:
        char = window.getch()
    except KeyboardInterrupt:
        curses.endwin()
        os._exit(1)
    if char == ord("q"):
        curses.endwin()
        os._exit(1)
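
# ---------------------------------------------------------------------------
# Separate helper sketch (run on its own, not part of the scraper above).
# The scraper assumes a "top-1m.csv" file with "rank,domain" rows already sits
# in the working directory; one plausible source is the Alexa top sites list.
# The URL below is an assumption and may no longer be served.
import io
import zipfile

import requests

ALEXA_URL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"  # assumed source

resp = requests.get(ALEXA_URL, timeout=30)
resp.raise_for_status()
with zipfile.ZipFile(io.BytesIO(resp.content)) as archive:
    # The archive contains a single top-1m.csv with lines like "1,google.com".
    archive.extract("top-1m.csv")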