Created
November 12, 2018 07:24
-
-
Save elvircrn/06b5e1770010faf88d8fc788ecf2ff0e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import urllib | |
import urllib.request | |
import os | |
import concurrent.futures | |
import zipfile | |
import shutil | |
def download_stuff(comic_url, comic_name,
                   base_dir='C:/Users/elvircrn/Documents/rccrawler/src/'):
    """Download every issue linked from a series index page as .cbz archives.

    The index page at ``comic_url`` is fetched and scanned for issue links.
    Each issue page is then scanned for image URLs; the selected images are
    downloaded into a per-issue folder, the folder is zipped into
    ``<folder>.cbz`` and the folder is removed.

    Args:
        comic_url: URL of the series index page.
        comic_name: Series name; used as the output sub-folder name and as
            the prefix of every issue archive name.
        base_dir: Directory under which the ``comic_name`` folder is created.
            Defaults to the location the script originally hard-coded.

    Failures of individual issues or images are reported with ``print`` and
    do not abort the remaining downloads (the crawl is best-effort).
    """
    # Raw strings: '\d' and '\.' are regex escapes, not string escapes.
    # Literal dots are escaped so they no longer match arbitrary characters.
    issue_link_re = re.compile(
        r'http://striputopija\.blogspot\.rs/\d{4}.*?\.html')
    page_image_re = re.compile(r'//\d\.bp\.blogspot\.com/.*?\d{3}\.jpg')

    base_folder = os.path.join(base_dir, comic_name)
    # exist_ok: re-running the crawler must not fail on an existing folder.
    os.makedirs(base_folder, exist_ok=True)

    def zipdir(path, ziph):
        """Add every file below ``path`` to the open zip handle ``ziph``."""
        for root, _dirs, files in os.walk(path):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                # Store members relative to the issue folder so the archive
                # does not embed the on-disk directory hierarchy.
                ziph.write(full_path,
                           arcname=os.path.relpath(full_path, path))

    def save_img(comic_page, folder, idx):
        """Download one image URL to ``<folder>/<idx>.jpg``."""
        urllib.request.urlretrieve(
            comic_page, os.path.join(folder, str(idx) + '.jpg'))

    def report_failures(jobs, action):
        """Print the exception of every finished future in ``jobs`` that failed.

        ``jobs`` is a list of ``(future, description)`` pairs. It is only
        called after the owning executor has shut down, so ``exception()``
        returns immediately.
        """
        for future, description in jobs:
            error = future.exception()
            if error is not None:
                print('Failed to {} {}: {!r}'.format(
                    action, description, error))

    def get_comic(link):
        """Download a single issue page and pack its images into a .cbz."""
        # Issue label is the first three characters of the link's last path
        # segment (presumably the issue number - verify against the site).
        name = comic_name + ' #' + link.split('/')[-1][:3]
        folder = os.path.join(base_folder, name)

        with urllib.request.urlopen(link) as page:
            comic_pages = page_image_re.findall(page.read().decode('utf-8'))

        # Pages with fewer than 40 image matches are skipped. The check runs
        # before the folder is created so skipped issues leave nothing behind.
        if len(comic_pages) < 40:
            return

        os.makedirs(folder, exist_ok=True)

        jobs = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            for idx, comic_page in enumerate(comic_pages):
                # Keep match 0 and every odd match from index 5 onwards;
                # the other matches are skipped (presumably duplicates or
                # non-page images - TODO confirm against the page markup).
                if not (idx == 0 or (idx >= 5 and idx % 2 == 1)):
                    continue
                # Renumber the kept matches densely: 0, 5, 7, 9... -> 0, 1, 2, 3...
                page_number = idx // 2 - 1 if idx else 0
                # Matches are protocol-relative ('//host/...'); add a scheme.
                if not comic_page.startswith('h'):
                    comic_page = 'http:' + comic_page
                jobs.append((
                    executor.submit(save_img, comic_page, folder, page_number),
                    comic_page,
                ))
        report_failures(jobs, 'download image')

        # The context manager guarantees the archive is closed (and its
        # central directory written) even if adding a file raises.
        with zipfile.ZipFile(folder + '.cbz', 'w',
                             zipfile.ZIP_DEFLATED) as zipf:
            zipdir(folder, zipf)
        shutil.rmtree(folder)

    with urllib.request.urlopen(comic_url) as response:
        page = response.read().decode('utf-8')

    # The same link can occur several times in the page; drop duplicates
    # (keeping first-seen order) so two workers never process the same issue.
    links = list(dict.fromkeys(issue_link_re.findall(page)))
    if not links:
        print('No issue links found on ' + comic_url)
        return
    print(links[0])

    jobs = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        for link in links:
            jobs.append((executor.submit(get_comic, link), link))
    report_failures(jobs, 'download issue')
# Crawl both series in parallel, one worker thread per series. Leaving the
# ``with`` block waits for both crawls to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
    for index_url, series_name in (
        ('https://striputopija.blogspot.com/p/dylan-dog.html', 'DylanDog'),
        ('https://striputopija.blogspot.com/p/nathan-never.html', 'NathanNever'),
    ):
        pool.submit(download_stuff, index_url, series_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment