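# Scrape the <script> tags (external and inline) from the top 10,000 sites in
# an Alexa-style "top-1m.csv" ranking. Each script is saved under d/<site>/,
# with a curses display showing per-thread progress across 10 worker threads.
# Python 2 code (urlparse, xrange); needs requests and beautifulsoup4.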
import csv
import curses
import os
import re
import threading
import urlparse  # Python 2 stdlib; urllib.parse on Python 3
import uuid

import requests
from bs4 import BeautifulSoup
# Read the site names (second CSV column) for the top 10,000 entries.
sites = []
with open("top-1m.csv") as f:
    for row in csv.reader(f):
        sites.append(row[1])
        if len(sites) == 10000:
            break
# curses can garble on odd characters, so keep only a safe subset
# when writing status messages to the screen.
s = re.compile(r"[^a-zA-Z/:\-. ]")

def filter_text(msg):
    return s.sub("", msg)
def scrap(site, window, num):
    def status(msg):
        # Repaint this thread's status column (column 35 onwards).
        length = 100
        window.hline(num, 35, " ", length)
        window.addnstr(num, 35, filter_text(msg), length)
        window.refresh()

    status("downloading")
    r = requests.get("http://" + site, timeout=2)
    page_url = r.url  # keep the final page URL; r is reused for scripts below

    status("parsing")
    soup = BeautifulSoup(r.content, "html.parser")

    directory = "d/" + site
    os.mkdir(directory)
    for script in soup.find_all("script"):
        src = script.get("src")
        if src:
            # External script: resolve src against the page URL, not
            # against whatever URL the previous request ended up at.
            src = urlparse.urljoin(page_url, src)
            status("loading " + src)
            try:
                r = requests.get(src, timeout=2)
            except Exception as e:
                status(str(e))
                continue
            with open(directory + "/" + str(uuid.uuid4()) + "src", "wb") as f:
                f.write(r.text.encode("utf-8"))
        else:
            status("saving inline")
            with open(directory + "/" + str(uuid.uuid4()) + "inline", "wb") as f:
                f.write(script.text.encode("utf-8"))
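# Note: curses makes no thread-safety guarantees, and ten workers share this
# one window. The small writes here mostly interleave harmlessly, but a lock
# around the status updates would be the more careful design.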
class Scrapper(threading.Thread):
    """Worker thread: scrapes its share of sites, reporting on one curses row."""

    def __init__(self, sites, num, window):
        threading.Thread.__init__(self)
        self.sites = sites
        self.num = num
        self.window = window

    def run(self):
        for (index, site) in enumerate(self.sites):
            # Left column: "<row>/<index>: <site>" progress label.
            self.window.hline(self.num, 0, " ", 70)
            self.window.addnstr(self.num, 0, "{num}/{index:4}: {site}".format(
                num=self.num, index=index, site=site), 30)
            self.window.refresh()
            try:
                scrap(site, self.window, self.num)
            except Exception as e:
                self.window.hline(self.num, 35, " ", 35)
                self.window.addnstr(self.num, 35, str(e), 35)
                self.window.refresh()
def partition(lst, n):
    # Split lst into n contiguous chunks of near-equal size.
    division = len(lst) / float(n)
    return [lst[int(round(division * i)):int(round(division * (i + 1)))]
            for i in xrange(n)]
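# For example, partition(range(10), 3) yields chunks of sizes 3, 4 and 3.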
os.mkdir("d")

window = curses.initscr()
curses.noecho()
curses.curs_set(0)

# Spread the site list over 10 worker threads, one curses row each.
for num, chunk in enumerate(partition(sites, 10)):
    Scrapper(chunk, num, window).start()

# Block on keyboard input until "q" or Ctrl-C, then bail out with os._exit()
# so the still-running worker threads can't keep the process alive.
while True:
    try:
        char = window.getch()
    except KeyboardInterrupt:
        curses.endwin()
        os._exit(1)
    if char == ord("q"):
        curses.endwin()
        os._exit(0)
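# Rough usage sketch (the script filename is hypothetical; top-1m.csv is the
# ranking file Amazon historically distributed as top-1m.csv.zip):
#   pip install requests beautifulsoup4
#   python scrape.py    # press "q" to quit
# Python 3 port: swap `import urlparse` for `from urllib import parse as
# urlparse` and `xrange` for `range`; the rest runs essentially unchanged.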