Spider for SeeBug
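A multi-threaded crawler that mirrors the articles on https://paper.seebug.org/ into a local papers/ directory, rewrites each article's image links to point at local copies, and then builds a top-level index.html table of contents. Besides the standard library it needs requests and beautifulsoup4 (pip install requests beautifulsoup4).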
# -*- coding: utf-8 -*-
import requests
import os
import uuid
import threading
import glob
from queue import Queue
from bs4 import BeautifulSoup
jobs = Queue()
WORKER_NUMBER = 32
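
# Work model: dispatcher() fills the jobs queue with one {"id": n} job per
# paper, and WORKER_NUMBER daemon threads run worker() to drain it. The
# queue is the only shared state, so no extra locking is needed.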
def worker(ID):
    while True:
        job = jobs.get()
        print("[*] Jobs: {}".format(jobs.qsize()))
        JID = job["id"]
        print("[+] Worker[{}] downloading paper: {}".format(ID, JID))
        try:
            fetch(JID)
        except Exception as e:
            print(repr(e))
        finally:
            # Always mark the job done so jobs.join() cannot hang on a failed paper
            jobs.task_done()

def download(url, target):
    print("[+] Downloading {} => {}".format(url, target))
    # Browser-like headers so the request is not trivially flagged as a bot
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,en-US;q=0.9,en;q=0.8,zh-CN;q=0.7",
        "cache-control": "max-age=0",
        "dnt": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    response = requests.get(url, headers=headers)
    content = response.content
    print("[*] Content length: {}".format(len(content)))
    with open(target, "wb") as f:
        f.write(content)
    return content
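
# On-disk layout produced by fetch():
#   papers/<id>/index.html       the article page, with img tags rewritten
#   papers/<id>/images/<uuid4>   each remote image, saved under a random name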
def fetch(ID):
    url = "https://paper.seebug.org/{}/".format(ID)
    print("{}".format(url))
    paper_path = os.path.join("papers", str(ID))
    image_path = os.path.join(paper_path, "images")
    index_path = os.path.join(paper_path, "index.html")
    # Make folders
    try:
        os.mkdir(paper_path)
        os.mkdir(image_path)
    except Exception as e:
        print(repr(e))
    # Download index.html, reusing a cached copy from an earlier run if present
    if os.path.exists(index_path):
        with open(index_path, "rb") as f:
            content = f.read()
    else:
        content = download(url, index_path)
    # Parse images
    soup = BeautifulSoup(content, "html.parser")
    title = soup.title.string
    print("[*] Title: {}".format(title))
    images = soup.find_all("img")
    for image in images:
        src = image.get("src", "")
        if src.startswith("https://") or src.startswith("http://"):
            filename = uuid.uuid4()
            file_path = os.path.join(image_path, str(filename))
            # Download images
            download(src, file_path)
            # Replace image src of index.html with the local copy
            content = content.replace(
                bytes(src, encoding="utf-8"),
                bytes(".{}images{}{}".format(os.path.sep, os.path.sep, filename), encoding="utf-8"),
            )
    # Update content
    with open(index_path, "wb") as f:
        f.write(content)

def dispatcher(start, end):
    for i in range(start, end + 1):
        try:
            print("[+] Generating job: {}".format(i))
            job = {
                "id": i,
            }
            jobs.put(job)
            print("[+] Generated {}".format(job))
        except Exception as e:
            print(repr(e))

def start_workers():
    for i in range(WORKER_NUMBER):
        t = threading.Thread(target=worker, args=(i,))
        t.daemon = True
        t.start()
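
# The worker threads are daemons, so they are killed as soon as the main
# thread exits; download_all() therefore has to block on jobs.join() until
# the queue is fully drained.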
def download_all():  # renamed: the original name "download" shadowed download(url, target) above
    try:
        os.mkdir("papers")
    except Exception as e:
        print(repr(e))
    # Start dispatcher
    dispatcher(1, 1340)
    # Start workers
    start_workers()
    # Block until every job has been fetched; checking qsize() == 0 alone
    # would return while workers are still in the middle of a download
    jobs.join()
    print("All jobs finished")
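
# Note: index() writes index.html into the current working directory and
# links to papers/<id>/index.html with relative paths, so it must be run
# from the same directory that holds papers/.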
def index():
    # Collect the numeric paper IDs that were mirrored under papers/
    papers = [int(i.split("papers{}".format(os.path.sep))[1]) for i in glob.glob("papers{}*".format(os.path.sep))]
    papers.sort()
    index_content = ""
    index_content += "<html><body>"
    i = 0
    # Highest (newest) paper IDs first
    for paper in papers[::-1]:
        i += 1
        index_path = os.path.join("papers", str(paper), "index.html")
        with open(index_path, "rb") as f:
            content = f.read()
        soup = BeautifulSoup(content, "html.parser")
        title = soup.title.string
        print("[*] Parsing: {}".format(title))
        index_content += "<b>{}</b><a href='{}'>{}</a><br/>\n".format(i, index_path, title)
    index_content += "</body></html>"
    with open("index.html", "w", encoding="utf-8") as f:
        f.write(index_content)

if __name__ == "__main__":
    download_all()
    index()
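
If newer papers have been published since the hard-coded 1..1340 range, the same queue machinery can be reused for an incremental crawl. A minimal sketch, assuming the script has been saved as seebug_spider.py (the filename is an assumption, not part of the gist) and that a previous run already created the papers/ directory:

# Hypothetical incremental crawl; the __main__ guard above keeps the import
# from triggering a full crawl.
from seebug_spider import dispatcher, start_workers, jobs, index

dispatcher(1341, 1400)   # end ID is a guess; raise it to the latest paper ID
start_workers()
jobs.join()              # wait for the daemon workers to drain the queue
index()                  # rebuild the local table of contents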