Spider for SeeBug
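A small multi-threaded crawler for https://paper.seebug.org/: it mirrors each paper's index.html, downloads the referenced images into a local images/ folder, rewrites the image links to point at the local copies, and finally generates a top-level index.html linking every saved paper.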
# -*- coding: utf-8 -*-
import requests
import os
import uuid
import threading
import time
import glob
from queue import Queue
from bs4 import BeautifulSoup
jobs = Queue()
WORKER_NUMBER = 32
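# A fixed pool of daemon threads consumes paper IDs from the shared queue;
# making them daemons lets the process exit once the main thread is done.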
def worker(ID):
    while True:
        job = jobs.get()
        print("[*] Jobs: {}".format(jobs.qsize()))
        JID = job["id"]
        print("[+] Worker[{}] downloading paper: {}".format(ID, JID))
        try:
            fetch(JID)
        except Exception as e:
            # A failed paper should not kill the worker thread
            print(repr(e))
        finally:
            jobs.task_done()
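# Fetch a URL with browser-like headers (some sites reject the default
# python-requests User-Agent), save the raw bytes to `target`, and return them.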
def download(url, target):
    print("[+] Downloading {} => {}".format(url, target))
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,en-US;q=0.9,en;q=0.8,zh-CN;q=0.7",
        "cache-control": "max-age=0",
        "dnt": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    response = requests.get(url, headers=headers)
    content = response.content
    print("[*] Content length: {}".format(len(content)))
    with open(target, "wb") as f:
        f.write(content)
    return content
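# Mirror a single paper: create papers/<ID>/ and papers/<ID>/images/, reuse a
# cached index.html if one exists, download every absolute-URL image under a
# random UUID filename, and rewrite the HTML to reference the local copies.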
def fetch(ID):
    url = "https://paper.seebug.org/{}/".format(ID)
    print("{}".format(url))
    paper_path = "papers{}{}".format(os.path.sep, ID)
    image_path = "{}{}images".format(paper_path, os.path.sep)
    index_path = "{}{}index.html".format(paper_path, os.path.sep)
    # Make folders (makedirs creates paper_path and image_path in one call,
    # and exist_ok avoids raising when re-running over a partial download)
    try:
        os.makedirs(image_path, exist_ok=True)
    except Exception as e:
        print(repr(e))
    # Download index.html (skip if already cached on disk)
    if os.path.exists(index_path):
        with open(index_path, "rb") as f:
            content = f.read()
    else:
        content = download(url, index_path)
    # Parse images
    soup = BeautifulSoup(content, "html.parser")
    title = soup.title.string
    print("[*] Title: {}".format(title))
    images = soup.find_all("img")
    for image in images:
        src = image.get("src", "")  # some <img> tags carry no src attribute
        if src.startswith("https://") or src.startswith("http://"):
            filename = uuid.uuid4()
            file_path = ".{}{}{}{}".format(os.path.sep, image_path, os.path.sep, filename)
            # Download images
            download(src, file_path)
            # Replace image src of index.html with the local copy
            content = content.replace(
                bytes(src, encoding="utf-8"),
                bytes(".{}images{}{}".format(os.path.sep, os.path.sep, filename), encoding="utf-8"),
            )
    # Update content
    with open(index_path, "wb") as f:
        f.write(content)
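# Enqueue one job per paper ID in [start, end].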
def dispatcher(start, end):
    for i in range(start, end + 1):
        job = {"id": i}
        jobs.put(job)
        print("[+] Generated job: {}".format(job))
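# Spawn WORKER_NUMBER daemon threads running worker().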
def start_workers():
    for i in range(WORKER_NUMBER):
        t = threading.Thread(target=worker, args=(i,))
        t.daemon = True
        t.start()
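# Top-level crawl: create the papers/ folder, enqueue paper IDs 1..1340,
# start the worker pool, and wait for the queue to drain.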
def crawl():
    try:
        os.makedirs("papers", exist_ok=True)
    except Exception as e:
        print(repr(e))
    # Start dispatcher
    dispatcher(1, 1340)
    # Start workers
    start_workers()
    # Poll for progress, then join(): qsize() can hit zero while workers are
    # still mid-fetch, and join() waits for every task_done()
    while jobs.qsize() > 0:
        print("Waiting for jobs to finish, {} jobs remain".format(jobs.qsize()))
        time.sleep(5)
    jobs.join()
    print("All jobs finished")
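# Build a top-level index.html linking every downloaded paper, newest first,
# using each saved page's <title> as the link text.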
def index():
    papers = [int(i.split("papers{}".format(os.path.sep))[1]) for i in glob.glob("papers{}*".format(os.path.sep))]
    papers.sort()
    index_content = ""
    index_content += "<html><body>"
    i = 0
    for paper in papers[::-1]:
        i += 1
        index_path = "papers{}{}{}index.html".format(os.path.sep, paper, os.path.sep)
        with open(index_path, "rb") as f:
            content = f.read()
        soup = BeautifulSoup(content, "html.parser")
        title = soup.title.string
        print("[*] Parsing: {}".format(title))
        index_content += "<b>{}</b><a href='{}'>{}</a><br/>\n".format(i, index_path, title)
    index_content += "</body></html>"
    with open("index.html", "w", encoding="utf-8") as f:
        f.write(index_content)
if __name__ == "__main__":
    crawl()
    index()
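Usage (a minimal sketch; the filename seebug_spider.py is illustrative, and the requests and beautifulsoup4 packages must be installed first):

pip install requests beautifulsoup4
python seebug_spider.py

The crawl saves each paper under papers/<id>/index.html with its images in papers/<id>/images/, and index() then writes a combined index.html to the working directory.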