Last active
January 21, 2023 20:34
-
-
Save jimklo/8b0883e6b3d9553f2115d2df3e858f46 to your computer and use it in GitHub Desktop.
Gallery Image Download
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: imgextract
channels:
  - conda-forge
  - defaults
dependencies:
  - bzip2=1.0.8
  - ca-certificates=2022.12.7
  - geckodriver=0.32.0
  - libcxx=14.0.6
  - libffi=3.4.2
  - libsqlite=3.40.0
  - libzlib=1.2.13
  - ncurses=6.3
  - openssl=3.0.7
  - pip=22.3.1
  - python=3.11.0
  - python-chromedriver-binary=2.42.0
  - readline=8.1.2
  - setuptools=66.1.0
  - tk=8.6.12
  - tzdata=2022g
  - wheel=0.38.4
  - xz=5.2.6
  - pip:
      - async-generator==1.10
      - attrs==22.2.0
      - certifi==2022.12.7
      - charset-normalizer==3.0.1
      - h11==0.14.0
      - idna==3.4
      - outcome==1.2.0
      - pysocks==1.7.1
      - requests==2.28.2
      - selenium==4.7.2
      - sniffio==1.3.0
      - sortedcontainers==2.4.0
      - trio==0.22.0
      - trio-websocket==0.9.2
      - urllib3==1.26.14
      - wsproto==1.2.0
prefix: /usr/local/Caskroom/miniconda/base/envs/imgextract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from time import sleep
from urllib.parse import urlsplit

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
@dataclass
class GalleryImg:
    """An image found on a gallery page.

    Instances are built from ``<img>`` elements: ``alt`` carries the
    element's ``alt`` attribute and ``src`` its ``src`` attribute.
    """

    # Alternative text of the <img> element; the download file name is
    # derived from it.
    alt: str
    # URL the image bytes are fetched from.
    src: str
def get_images(uri: str) -> "list[GalleryImg]":
    """Collect the Midjourney CDN images shown on a gallery page.

    Opens ``uri`` in a Firefox WebDriver session, pauses for five seconds,
    and then inspects every ``<img>`` element on the page. An element is
    kept when its ``src`` starts with ``https://cdn.midjourney.com/`` and
    ends with ``.webp``.

    Args:
        uri: Address of the gallery page to load.

    Returns:
        One ``GalleryImg`` per matching element, in the order the driver
        reports them. The list is empty when nothing matches.
    """
    browser = webdriver.Firefox()
    try:
        browser.get(uri)
        # Fixed pause before reading the DOM; presumably this gives the
        # page's scripts time to render the gallery.
        sleep(5)
        images = []
        for elem in browser.find_elements(By.TAG_NAME, 'img'):
            # get_attribute() returns None when the attribute is missing,
            # so normalise to "" before calling string methods.
            src = elem.get_attribute('src') or ""
            if src.startswith("https://cdn.midjourney.com/") and src.endswith(".webp"):
                alt_text = elem.get_attribute('alt') or ""
                images.append(GalleryImg(alt_text, src))
        return images
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and running it in ``finally`` releases the
        # browser even when loading or scraping raises.
        browser.quit()
def get_image_name(img: GalleryImg) -> str:
    """Build a file name for ``img`` from its alt text and URL extension.

    The stem is the alt text up to (not including) the first comma, passed
    through ``slugify``. The extension is taken from the end of the URL's
    path component; ``webp`` is used when the path has no extension.

    Args:
        img: Image whose ``alt`` and ``src`` are used.

    Returns:
        ``"<stem>.<extension>"``, with the stem truncated so the whole
        name is at most 254 characters long.
    """
    # A missing alt attribute may arrive as None; treat it as empty text.
    root = re.sub(r",.*$", "", img.alt or "")
    root = slugify(root)

    # search(), not match(): match() only looks at the start of the string,
    # so an end-anchored extension pattern could never succeed with it.
    # Only the URL path is examined so a query string or fragment cannot
    # leak into the extension.
    m = re.search(r"\.([A-Za-z0-9]+)$", urlsplit(img.src).path)
    suffix = m.group(1) if m else "webp"

    # Cap the final "<root>.<suffix>" at 254 characters (presumably to stay
    # under common file-name length limits).
    if len(root) + len(suffix) >= 254:
        max_len = 253 - len(suffix)
        root = root[0:max_len]
    return f"{root}.{suffix}"
def slugify(value: object, allow_unicode: bool = False) -> str:
    """Turn ``value`` into a lowercase, dash-separated slug.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py

    Convert to ASCII if ``allow_unicode`` is False. Convert spaces or
    repeated dashes to single dashes. Remove characters that aren't
    alphanumerics, underscores, or hyphens. Convert to lowercase. Also
    strip leading and trailing whitespace, dashes, and underscores.

    Args:
        value: Object to slugify; it is converted with ``str()`` first.
        allow_unicode: Keep non-ASCII word characters instead of dropping
            them.

    Returns:
        The slug, which may be empty.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        # NFKD splits accented characters into base + combining mark, so
        # the ASCII round-trip keeps the base letter and drops the mark.
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')
def download_img(img: GalleryImg, tgt: Path = Path("output"), timeout: float = 30.0) -> None:
    """Download ``img.src`` into the directory ``tgt``.

    The file name comes from ``get_image_name(img)``. ``tgt`` is created
    (including parents) when it does not exist. A response whose status
    code is not 200 is skipped without writing a file.

    Args:
        img: Image to fetch.
        tgt: Directory the file is written to.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so
            a stalled server cannot hang the run indefinitely.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    tgt.mkdir(parents=True, exist_ok=True)

    # The context manager closes the streamed response, releasing the
    # connection, even when the status check or the write fails.
    with requests.get(img.src, stream=True, timeout=timeout) as resp:
        if resp.status_code != 200:
            return
        filename = tgt / get_image_name(img)
        with filename.open('wb') as wh:
            # iter_content() applies content decoding (e.g. gzip), which a
            # plain resp.raw.read() does not, and writes in bounded chunks
            # instead of holding the whole image in memory.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                wh.write(chunk)
if __name__ == "__main__":
    # Gallery page scraped by default; a different page can be given as the
    # first command-line argument.
    gallery_uri = "https://midjourney.com/showcase/recent/?fbclid=IwAR3WriA1xPrYwEoF4CGvl98yN-ZieNLRkhk2hz05ia5yqrxQYbTtMi2p0e4"
    if len(sys.argv) > 1:
        gallery_uri = sys.argv[1]

    # Scrape the page once, then download each image that was found.
    img_list = get_images(gallery_uri)
    for img in img_list:
        download_img(img)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment