jimklo · January 21, 2023 20:34
diff --git a/environment.yml b/environment.yml
 name: imgextract
 channels:
  - conda-forge
  - defaults
 dependencies:
  - bzip2=1.0.8
  - ca-certificates=2022.12.7
  - geckodriver=0.32.0
  - libcxx=14.0.6
  - libffi=3.4.2
  - libsqlite=3.40.0
  - libzlib=1.2.13
  - ncurses=6.3
  - openssl=3.0.7
  - pip=22.3.1
  - python=3.11.0
  - python-chromedriver-binary=2.42.0
  - readline=8.1.2
  - setuptools=66.1.0
  - tk=8.6.12
  - tzdata=2022g
  - wheel=0.38.4
  - xz=5.2.6
  - pip:
      - async-generator==1.10
      - attrs==22.2.0
      - certifi==2022.12.7
      - charset-normalizer==3.0.1
      - h11==0.14.0
      - idna==3.4
      - outcome==1.2.0
      - pysocks==1.7.1
      - requests==2.28.2
      - selenium==4.7.2
      - sniffio==1.3.0
      - sortedcontainers==2.4.0
      - trio==0.22.0
      - trio-websocket==0.9.2
      - urllib3==1.26.14
      - wsproto==1.2.0
 prefix: /usr/local/Caskroom/miniconda/base/envs/imgextract
diff --git a/extract.py b/extract.py

 from dataclasses import dataclass
 import re
 import unicodedata
 import requests
 from time import sleep
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from pathlib import Path


 @dataclass
 class GalleryImg:
     """One image found on a gallery page.

     Carries the two ``<img>`` attributes the rest of the script works with.
     """

     # Value of the element's ``alt`` attribute; the file name is derived from it.
     alt: str
     # Value of the element's ``src`` attribute; the URL the image is fetched from.
     src: str

 def get_images(uri: str, settle_seconds: float = 5) -> "list[GalleryImg]":
     """Collect the Midjourney CDN ``.webp`` images shown on a page.

     Opens *uri* in a new Firefox WebDriver session, pauses, then inspects
     every ``<img>`` element on the page.

     Args:
         uri: Address of the page to load.
         settle_seconds: How long to pause after loading the page before the
             DOM is queried. Defaults to 5 seconds.

     Returns:
         A list of ``GalleryImg`` (alt text, source URL), one for each image
         whose ``src`` starts with ``https://cdn.midjourney.com/`` and ends
         with ``.webp``.
     """
     browser = webdriver.Firefox()
     try:
         browser.get(uri)
         # Fixed pause before reading the DOM -- presumably to give the page
         # time to load its images.
         sleep(settle_seconds)
         images = []
         for elem in browser.find_elements(By.TAG_NAME, 'img'):
             src = elem.get_attribute('src')
             # get_attribute() yields None for an <img> without a src, which
             # would make the startswith() call below raise AttributeError.
             if not src:
                 continue
             if src.startswith("https://cdn.midjourney.com/") and src.endswith(".webp"):
                 images.append(GalleryImg(elem.get_attribute('alt'), src))
         return images
     finally:
         # quit() ends the whole driver session (close() only closes the
         # current window), and the finally guarantees it runs on errors too.
         browser.quit()

 def get_image_name(img: GalleryImg):
     """Build a file name for *img* from its alt text and URL extension.

     The stem is the slugified alt text with a trailing ``, ...`` portion
     removed; the extension is taken from the end of ``img.src`` and falls
     back to ``webp`` when the URL does not end in one.

     Args:
         img: Image whose ``alt`` and ``src`` are used.

     Returns:
         A string of the form ``<stem>.<extension>``, at most 254 characters
         long.
     """
     # alt may be None when the element had no alt attribute.
     root = slugify(re.sub(r",.*$", "", img.alt or ""))
     if not root:
         # An empty stem would produce the hidden file name ".webp".
         root = "image"

     # re.search, not re.match: match() only tests the START of the string,
     # so a pattern for the trailing ".ext" could never hit on a URL.
     # Excluding "/", "?" and "#" keeps path or query text out of the suffix.
     m = re.search(r"\.([^./?#]+)$", img.src)
     # Match.group(1) is the captured extension (Match.groups is a method and
     # cannot be subscripted).
     suffix = m.group(1) if m else "webp"

     # Trim the stem so stem + "." + suffix stays within 254 characters.
     if len(root) + len(suffix) >= 254:
         root = root[:253 - len(suffix)]

     return f"{root}.{suffix}"

 def slugify(value, allow_unicode=False):
     """
     Turn an arbitrary value into a lowercase, dash-separated slug.

     Adapted from
     https://github.com/django/django/blob/master/django/utils/text.py

     The value is converted to ``str``. Unless 'allow_unicode' is true it is
     reduced to ASCII (characters with no ASCII form are dropped). The text is
     lowercased, every character that is not alphanumeric, an underscore,
     whitespace or a hyphen is removed, each run of whitespace and hyphens
     becomes one hyphen, and leading/trailing hyphens and underscores are
     stripped.
     """
     text = str(value)
     if not allow_unicode:
         # Decompose accented characters, then discard the non-ASCII parts.
         decomposed = unicodedata.normalize('NFKD', text)
         text = decomposed.encode('ascii', 'ignore').decode('ascii')
     else:
         text = unicodedata.normalize('NFKC', text)

     lowered = text.lower()
     cleaned = re.sub(r'[^\w\s-]', '', lowered)
     collapsed = re.sub(r'[-\s]+', '-', cleaned)
     return collapsed.strip('-_')


 def download_img(img: GalleryImg, tgt: Path = Path("output"), timeout: float = 30):
     """Download *img* into the directory *tgt*.

     The file name comes from ``get_image_name(img)``. A response with any
     status other than 200 is skipped without raising.

     Args:
         img: Image to fetch; ``img.src`` is the URL requested.
         tgt: Destination directory, created (with parents) if missing.
         timeout: Seconds passed to ``requests.get`` so a stalled server
             cannot hang the script indefinitely.

     Returns:
         The ``Path`` of the file written, or ``None`` when the response
         status was not 200.
     """
     tgt.mkdir(parents=True, exist_ok=True)

     # Using the response as a context manager releases the streamed
     # connection even if writing the file fails.
     with requests.get(img.src, stream=True, timeout=timeout) as resp:
         if resp.status_code != 200:
             return None

         filename = tgt / get_image_name(img)
         filename.parent.mkdir(parents=True, exist_ok=True)
         with filename.open('wb') as wh:
             # iter_content() decodes gzip/deflate transfer encodings, which
             # resp.raw.read() would have written to disk still compressed.
             for chunk in resp.iter_content(chunk_size=65536):
                 wh.write(chunk)
     return filename


 if __name__ == "__main__":
     import sys

     # Gallery page scraped when no URL is given on the command line.
     default_uri = "https://midjourney.com/showcase/recent/?fbclid=IwAR3WriA1xPrYwEoF4CGvl98yN-ZieNLRkhk2hz05ia5yqrxQYbTtMi2p0e4"
     # Optional first argument overrides the built-in gallery address.
     gallery_uri = sys.argv[1] if len(sys.argv) > 1 else default_uri

     for img in get_images(gallery_uri):
         download_img(img)
	name: imgextract
	channels:
	- conda-forge
	- defaults
	dependencies:
	- bzip2=1.0.8
	- ca-certificates=2022.12.7
	- geckodriver=0.32.0
	- libcxx=14.0.6
	- libffi=3.4.2
	- libsqlite=3.40.0
	- libzlib=1.2.13
	- ncurses=6.3
	- openssl=3.0.7
	- pip=22.3.1
	- python=3.11.0
	- python-chromedriver-binary=2.42.0
	- readline=8.1.2
	- setuptools=66.1.0
	- tk=8.6.12
	- tzdata=2022g
	- wheel=0.38.4
	- xz=5.2.6
	- pip:
	    - async-generator==1.10
	    - attrs==22.2.0
	    - certifi==2022.12.7
	    - charset-normalizer==3.0.1
	    - h11==0.14.0
	    - idna==3.4
	    - outcome==1.2.0
	    - pysocks==1.7.1
	    - requests==2.28.2
	    - selenium==4.7.2
	    - sniffio==1.3.0
	    - sortedcontainers==2.4.0
	    - trio==0.22.0
	    - trio-websocket==0.9.2
	    - urllib3==1.26.14
	    - wsproto==1.2.0
	prefix: /usr/local/Caskroom/miniconda/base/envs/imgextract

	from dataclasses import dataclass
	import re
	import unicodedata
	import requests
	from time import sleep
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from pathlib import Path


	@dataclass
	class GalleryImg:
	    """One image found on a gallery page.

	    Carries the two ``<img>`` attributes the rest of the script works with.
	    """

	    # Value of the element's ``alt`` attribute; the file name is derived from it.
	    alt: str
	    # Value of the element's ``src`` attribute; the URL the image is fetched from.
	    src: str

	def get_images(uri: str, settle_seconds: float = 5) -> "list[GalleryImg]":
	    """Collect the Midjourney CDN ``.webp`` images shown on a page.

	    Opens *uri* in a new Firefox WebDriver session, pauses, then inspects
	    every ``<img>`` element on the page.

	    Args:
	        uri: Address of the page to load.
	        settle_seconds: How long to pause after loading the page before the
	            DOM is queried. Defaults to 5 seconds.

	    Returns:
	        A list of ``GalleryImg`` (alt text, source URL), one for each image
	        whose ``src`` starts with ``https://cdn.midjourney.com/`` and ends
	        with ``.webp``.
	    """
	    browser = webdriver.Firefox()
	    try:
	        browser.get(uri)
	        # Fixed pause before reading the DOM -- presumably to give the page
	        # time to load its images.
	        sleep(settle_seconds)
	        images = []
	        for elem in browser.find_elements(By.TAG_NAME, 'img'):
	            src = elem.get_attribute('src')
	            # get_attribute() yields None for an <img> without a src, which
	            # would make the startswith() call below raise AttributeError.
	            if not src:
	                continue
	            if src.startswith("https://cdn.midjourney.com/") and src.endswith(".webp"):
	                images.append(GalleryImg(elem.get_attribute('alt'), src))
	        return images
	    finally:
	        # quit() ends the whole driver session (close() only closes the
	        # current window), and the finally guarantees it runs on errors too.
	        browser.quit()

	def get_image_name(img: GalleryImg):
	    """Build a file name for *img* from its alt text and URL extension.

	    The stem is the slugified alt text with a trailing ``, ...`` portion
	    removed; the extension is taken from the end of ``img.src`` and falls
	    back to ``webp`` when the URL does not end in one.

	    Args:
	        img: Image whose ``alt`` and ``src`` are used.

	    Returns:
	        A string of the form ``<stem>.<extension>``, at most 254 characters
	        long.
	    """
	    # alt may be None when the element had no alt attribute.
	    root = slugify(re.sub(r",.*$", "", img.alt or ""))
	    if not root:
	        # An empty stem would produce the hidden file name ".webp".
	        root = "image"

	    # re.search, not re.match: match() only tests the START of the string,
	    # so a pattern for the trailing ".ext" could never hit on a URL.
	    # Excluding "/", "?" and "#" keeps path or query text out of the suffix.
	    m = re.search(r"\.([^./?#]+)$", img.src)
	    # Match.group(1) is the captured extension (Match.groups is a method and
	    # cannot be subscripted).
	    suffix = m.group(1) if m else "webp"

	    # Trim the stem so stem + "." + suffix stays within 254 characters.
	    if len(root) + len(suffix) >= 254:
	        root = root[:253 - len(suffix)]

	    return f"{root}.{suffix}"

	def slugify(value, allow_unicode=False):
	    """
	    Taken from https://github.com/django/django/blob/master/django/utils/text.py
	    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
	    dashes to single dashes. Remove characters that aren't alphanumerics,
	    underscores, or hyphens. Convert to lowercase. Also strip leading and
	    trailing whitespace, dashes, and underscores.
	    """
	    value = str(value)
	    if allow_unicode:
	        value = unicodedata.normalize('NFKC', value)
	    else:
	        # Decompose accented characters, then discard whatever has no
	        # ASCII representation.
	        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
	    value = re.sub(r'[^\w\s-]', '', value.lower())
	    return re.sub(r'[-\s]+', '-', value).strip('-_')


	def download_img(img: GalleryImg, tgt: Path = Path("output"), timeout: float = 30):
	    """Download *img* into the directory *tgt*.

	    The file name comes from ``get_image_name(img)``. A response with any
	    status other than 200 is skipped without raising.

	    Args:
	        img: Image to fetch; ``img.src`` is the URL requested.
	        tgt: Destination directory, created (with parents) if missing.
	        timeout: Seconds passed to ``requests.get`` so a stalled server
	            cannot hang the script indefinitely.

	    Returns:
	        The ``Path`` of the file written, or ``None`` when the response
	        status was not 200.
	    """
	    tgt.mkdir(parents=True, exist_ok=True)

	    # Using the response as a context manager releases the streamed
	    # connection even if writing the file fails.
	    with requests.get(img.src, stream=True, timeout=timeout) as resp:
	        if resp.status_code != 200:
	            return None

	        filename = tgt / get_image_name(img)
	        filename.parent.mkdir(parents=True, exist_ok=True)
	        with filename.open('wb') as wh:
	            # iter_content() decodes gzip/deflate transfer encodings, which
	            # resp.raw.read() would have written to disk still compressed.
	            for chunk in resp.iter_content(chunk_size=65536):
	                wh.write(chunk)
	    return filename


	if __name__ == "__main__":
	    import sys

	    # Gallery page scraped when no URL is given on the command line.
	    default_uri = "https://midjourney.com/showcase/recent/?fbclid=IwAR3WriA1xPrYwEoF4CGvl98yN-ZieNLRkhk2hz05ia5yqrxQYbTtMi2p0e4"
	    # Optional first argument overrides the built-in gallery address.
	    gallery_uri = sys.argv[1] if len(sys.argv) > 1 else default_uri

	    for img in get_images(gallery_uri):
	        download_img(img)