# Source: GitHub Gist by @elvircrn (gist id 06b5e1770010faf88d8fc788ecf2ff0e),
# created November 12, 2018 07:24.
import re
import urllib
import urllib.request
import os
import concurrent.futures
import zipfile
import shutil
def download_stuff(comic_url, comic_name,
                   base_dir='C:/Users/elvircrn/Documents/rccrawler/src/'):
    """Download every issue linked from a comic index page as a .cbz archive.

    The page at ``comic_url`` is scanned for issue links. Each issue page is
    scanned for page images; the selected images are saved into a working
    folder, zipped into ``<base_dir>/<comic_name>/<issue name>.cbz`` and the
    working folder is removed afterwards.

    Args:
        comic_url: URL of the index page that lists the issues.
        comic_name: Series name; used for the output folder and as the
            prefix of every issue/archive name.
        base_dir: Directory under which the ``comic_name`` folder is created.
            Defaults to the location that used to be hard-coded.

    Returns:
        None. Failures of individual issues or pages are printed and skipped
        instead of being raised.

    Raises:
        OSError: If the index page cannot be fetched (``urllib.error.URLError``
            is a subclass) or the output folder cannot be created.
    """
    # Raw strings: the patterns are unchanged, but '\d' / '\/' / '\.' inside a
    # normal string literal are invalid escapes that newer Pythons warn about.
    # NOTE(review): only http:// links on the .rs domain are matched, although
    # the index URLs used by the caller are https://...blogspot.com - confirm.
    issue_link_re = re.compile(r'http://striputopija.blogspot.rs/\d{4}.*?\.html')
    page_image_re = re.compile(r'\/\/\d\.bp\.blogspot\.com\/.*?\d{3}.jpg')

    base_folder = os.path.join(base_dir, comic_name)
    # makedirs(exist_ok=True) survives re-runs and missing parent folders.
    os.makedirs(base_folder, exist_ok=True)

    def zipdir(path, ziph):
        """Add every file below ``path`` to the open zip handle ``ziph``."""
        for root, _dirs, files in os.walk(path):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                # Store entries relative to the issue folder; otherwise the
                # archive reproduces the whole on-disk directory chain.
                ziph.write(full_path, os.path.relpath(full_path, path))

    def save_img(comic_page, folder, idx):
        """Download one image to ``<folder>/<idx>.jpg``."""
        urllib.request.urlretrieve(
            comic_page, os.path.join(folder, str(idx) + '.jpg'))

    def get_comic(link):
        """Download the selected pages of one issue and pack them as .cbz."""
        # Issue name: series name plus the first three characters of the
        # last path segment of the link.
        name = comic_name + ' #' + link.split('/')[-1][:3]
        folder = os.path.join(base_folder, name)

        with urllib.request.urlopen(link) as page:
            comic_pages = page_image_re.findall(page.read().decode('utf-8'))

        # Pages with fewer than 40 image matches are skipped.  The folder is
        # only created after this check so skipped issues leave nothing behind.
        if len(comic_pages) < 40:
            return
        os.makedirs(folder, exist_ok=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            downloads = {}
            for idx, comic_page in enumerate(comic_pages):
                # Keep match 0 and every odd match from 5 onwards.
                # NOTE(review): presumably this drops duplicated/non-page
                # images in the blog markup - confirm against a live page.
                keep = idx == 0 or (idx >= 5 and idx % 2 == 1)
                if not keep:
                    continue
                # Renumber kept matches consecutively: 0 -> 0, 5 -> 1, 7 -> 2 ...
                page_no = idx // 2 - 1 if idx else 0
                # Matches are protocol-relative ('//host/...'); add a scheme.
                if not comic_page.startswith('h'):
                    comic_page = 'http:' + comic_page
                future = executor.submit(save_img, comic_page, folder, page_no)
                downloads[future] = comic_page
            # Inspect every future so failed downloads are reported instead of
            # vanishing; the issue is still archived with the pages that worked.
            for future in concurrent.futures.as_completed(downloads):
                try:
                    future.result()
                except Exception as exc:
                    print('Failed to download page', downloads[future],
                          'of', name + ':', exc)

        # Close the archive (via the context manager) before deleting the
        # folder it was built from.
        with zipfile.ZipFile(folder + '.cbz', 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipdir(folder, zipf)
        shutil.rmtree(folder)

    with urllib.request.urlopen(comic_url) as response:
        page = response.read().decode('utf-8')

    # dict.fromkeys drops repeated links (which would make two workers fight
    # over the same folder) while keeping first-seen order.
    links = list(dict.fromkeys(issue_link_re.findall(page)))
    if not links:
        print('No issue links found at', comic_url)
        return
    print(links[0])

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        issues = {executor.submit(get_comic, link): link for link in links}
        for future in concurrent.futures.as_completed(issues):
            try:
                future.result()
            except Exception as exc:
                # One broken issue must not abort the remaining downloads.
                print('Failed to download issue', issues[future] + ':', exc)
def main():
    """Download the Dylan Dog and Nathan Never series in parallel.

    Each series is handed to ``download_stuff`` on its own worker thread.
    A failure in one series is printed and does not stop the other.
    """
    series = [
        ('https://striputopija.blogspot.com/p/dylan-dog.html', 'DylanDog'),
        ('https://striputopija.blogspot.com/p/nathan-never.html', 'NathanNever'),
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        jobs = {
            executor.submit(download_stuff, url, name): name
            for url, name in series
        }
        # Futures must be inspected, otherwise any exception raised inside a
        # worker is silently discarded.
        for future in concurrent.futures.as_completed(jobs):
            try:
                future.result()
            except Exception as exc:
                print('Download of', jobs[future], 'failed:', exc)


# Guard the entry point so importing this module does not start downloads.
if __name__ == '__main__':
    main()
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)