Created
April 26, 2021 13:46
-
-
Save Xorcerer/2f20c0f278e1f397fae8b1816c1ebb68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import os | |
import re | |
import socket | |
import socks | |
import sys | |
import urllib.request | |
from pyppeteer import launch | |
from pyppeteer.errors import TimeoutError | |
# Address of the local proxy used both by the headless browser and by the
# image downloads.
PROXY_HOST = 'localhost'
PROXY_PORT = 7777

# Directory under which one sub-folder per comic title is created.
DEST_ROOT = r'D:\Books\Comics'

# Captures the text of the first <h1> element; used as the comic title.
title_re = re.compile(r'<h1>(.*?)</h1>', re.IGNORECASE)

# Captures the quoted URL from each `Large_cgurl[<n>] = "<url>"` assignment
# found in the page source. A raw string is required here: `\[` and `\d`
# are not valid escapes in a normal string literal and trigger a warning.
image_re = re.compile(r'Large_cgurl\[\d+\] = "(.*?)"')
def crawl_comic(content):
    """Download every image referenced by a comic page into a per-title folder.

    The comic title is read from the first ``<h1>`` element matched by
    ``title_re`` and used as the name of a sub-directory of ``DEST_ROOT``.
    Every URL matched by ``image_re`` is then saved into that directory under
    the last ``/``-separated component of the URL.

    Args:
        content: HTML source of the comic page, as a string.

    Raises:
        ValueError: If *content* is empty or contains no ``<h1>`` title.
    """
    if not content:
        raise ValueError('no page content to crawl')

    title_match = title_re.search(content)
    if title_match is None:
        raise ValueError('could not find a <h1> title in the page content')

    # Replace the socket class process-wide so that connections opened from
    # here on go through the SOCKS5 proxy.
    socks.set_default_proxy(socks.SOCKS5, PROXY_HOST, PROXY_PORT)
    socket.socket = socks.socksocket

    title = title_match.group(1)
    print(title)

    # Characters such as ':' or '?' are not allowed in file names on Windows,
    # and a path separator would turn the title into nested directories, so
    # replace them before building the destination path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    dirname = os.path.join(DEST_ROOT, safe_title)

    # Creates DEST_ROOT as well when it is missing and does not fail if the
    # folder already exists.
    os.makedirs(dirname, exist_ok=True)

    for img in image_re.finditer(content):
        url = img.group(1)
        print('downloading:', url)
        urllib.request.urlretrieve(url, os.path.join(dirname, url.split('/')[-1]))
async def download(url):
    """Load *url* in a headless browser and download the comic it contains.

    A browser is launched with ``PROXY_HOST:PROXY_PORT`` as its proxy server,
    the page is loaded (retrying on timeouts), and the resulting HTML is
    handed to ``crawl_comic``. The browser is closed even when loading or
    crawling fails.

    Args:
        url: Address of the comic page.

    Raises:
        TimeoutError: The ``TimeoutError`` this file imports from
            ``pyppeteer.errors``, raised when every load attempt timed out.
    """
    browser = await launch({
        'args': ['--proxy-server=%s:%d' % (PROXY_HOST, PROXY_PORT)],
        'headless': True,
    })
    try:
        page = await browser.newPage()

        async def get_html():
            """Return the page HTML, retrying the navigation on timeouts."""
            max_attempts = 20
            for _ in range(max_attempts):
                try:
                    await page.goto(url, waitUntil='domcontentloaded')
                    return await page.content()
                except TimeoutError:
                    print('timeout while loading:', url)
            # Fail loudly instead of returning None, which would only surface
            # later as an unrelated error while parsing the content.
            raise TimeoutError(
                'giving up on %s after %d timeouts' % (url, max_attempts))

        content = await get_html()
        # NOTE: crawl_comic is a regular (non-async) function, so the event
        # loop is blocked until it returns.
        crawl_comic(content)
    finally:
        # Always shut the browser down so no browser process is left behind.
        await browser.close()
def main():
    """Run the downloader for a URL given on the command line or typed in.

    The first command-line argument is used as the URL when present;
    otherwise the user is prompted until a non-empty value is entered.
    """
    url = sys.argv[1] if len(sys.argv) > 1 else None
    while not url:
        url = input('url:')

    # NOTE(review): calling get_event_loop() without a running loop is
    # deprecated in recent Python releases; consider asyncio.run() once the
    # minimum supported Python version is confirmed.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(download(url))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment