Downloads all comics from the Poorly Drawn Lines archive.
""" | |
Downloads all comics from the Poorly Drawn Lines archive. | |
Comic images are saved in directory defined by _OUTPUT_DIR. | |
""" | |
from pathlib import Path | |
from typing import Iterable, Tuple, Optional | |
from urllib.parse import urlparse | |
import bs4 | |
import requests | |
_URL_ARCHIVE: str = "https://poorlydrawnlines.com/archive/" | |
_PARSER_HTML: str = "html.parser" | |
_OUTPUT_DIR = Path("output") | |
def _get_posts_in_archive() -> Iterable[Tuple[str, str]]: | |
res_archive = requests.get(_URL_ARCHIVE) | |
archive = bs4.BeautifulSoup(res_archive.text, _PARSER_HTML) | |
div_content: bs4.Tag = archive.find("div", class_="content page") | |
post_anchors: Iterable[bs4.Tag] = div_content.find_all("a", href=True) | |
return [(elem.text, elem.attrs["href"]) for elem in post_anchors] | |
def _get_img_url_from_post(url: str) -> str: | |
div_post = bs4.BeautifulSoup(requests.get(url).text, _PARSER_HTML) | |
img: Optional[bs4.Tag] = div_post.find("div", class_="post").find("img") | |
if not img: | |
raise ValueError("Failed to find 'img' tag in the target post.") | |
return img.attrs["src"] | |
def _get_img(url: str, path: Path) -> None: | |
with requests.get(url, stream=True) as res: | |
res.raise_for_status() | |
with open(path, "wb") as img: | |
for chunk in res.iter_content(chunk_size=8192): | |
img.write(chunk) | |
def _scrape_comics_from_archive(path_output: Path): | |
for title, url_post in _get_posts_in_archive(): | |
try: | |
url_img: str = _get_img_url_from_post(url_post) | |
except ValueError: | |
print(f"Skipping {url_post} due to missing 'img' tag.") | |
continue | |
path_img: Path = path_output / Path(urlparse(url_img).path).name | |
if path_img.exists(): | |
print(f"{path_img.name} is already downloaded, skipping.") | |
else: | |
_get_img(url=url_img, path=path_img) | |
print(path_img.name) | |
if __name__ == "__main__": | |
_OUTPUT_DIR.mkdir(exist_ok=True, parents=True) | |
_scrape_comics_from_archive(_OUTPUT_DIR) |
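Usage note (not part of the gist): the script depends on the third-party requests and beautifulsoup4 packages, and the filename below is hypothetical since the gist does not name the file. One way to run it:

pip install requests beautifulsoup4
python download_pdl_comics.py

Downloaded images land in an output/ directory under the current working directory; re-running the script skips any files that are already present.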