Skip to content

Instantly share code, notes, and snippets.

@cjavad
Created December 21, 2019 23:11
Show Gist options
  • Save cjavad/9d1491075c30f57794d9e476d79973d5 to your computer and use it in GitHub Desktop.
Save cjavad/9d1491075c30f57794d9e476d79973d5 to your computer and use it in GitHub Desktop.
Python script that downloads manga chapters from mangadex.org; it works as an executable and takes command-line arguments. (Downloads into folders it creates.) REQUIRES SELENIUM and a matching CHROME + CHROMEDRIVER.
#!/usr/bin/env python3
import requests
import os
import sys
import argparse
def check_if_valid_url(url):
    """Return True if *url* is a link to a mangadex chapter.

    A valid link starts with ``http://mangadex.org/chapter/`` or
    ``https://mangadex.org/chapter/`` and does not contain ``gap``.
    Anything that is not a string is rejected.

    Returns:
        bool: True when the link is accepted, otherwise False.
    """
    valid_prefixes = (
        "http://mangadex.org/chapter/",
        "https://mangadex.org/chapter/",
    )
    # Anchor the check to the start of the string: a plain substring test would
    # also accept links that merely embed a mangadex address somewhere inside.
    return (
        isinstance(url, str)
        and url.startswith(valid_prefixes)
        and "gap" not in url
    )
# Using the sleep function to pause and wait for the webpage to load
from time import sleep
# Import the selenium webdriver
from selenium import webdriver
# Importing multiple elements from the webdriver/common namespace
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
# String form of type(driver) for a Chrome webdriver instance, kept as a
# constant so that str(type(obj)) can be compared against it when checking
# whether an object is a Chrome driver.
DRIVER_TYPE = "<class 'selenium.webdriver.chrome.webdriver.WebDriver'>"
def init_driver(path = "./chromedriver"):
    """Create and return a headless Chrome webdriver.

    Args:
        path: Location of the chromedriver executable.

    Returns:
        The webdriver.Chrome instance, started headless with a fixed
        window size.
    """
    window_size = "1920,1080"
    options = Options()
    for flag in ("--headless", "--window-size=%s" % window_size):
        options.add_argument(flag)
    return webdriver.Chrome(executable_path=path, options=options)
def nextchapter(driver):
    """Return the link to the next chapter from a loaded mangadex page.

    Args:
        driver: A webdriver that has already loaded a mangadex chapter page.

    Returns:
        The ``href`` attribute of the element with class
        ``chapter-link-right``, or False when the lookup fails with a
        WebDriverException.
    """
    try:
        link = driver.find_element_by_class_name("chapter-link-right")
        return link.get_attribute("href")
    except WebDriverException:
        # Only webdriver failures mean "no next chapter"; a bare except would
        # also hide KeyboardInterrupt and genuine programming errors.
        return False
def getallpages(baselink, total_pages):
    """Build the image links for every page of a chapter.

    The last path segment of *baselink* is taken as the file name of page 1.
    For each page number the first "1" in that file name is replaced by the
    page number; the rest of the link is left untouched.

    Args:
        baselink: Image link of the first page.
        total_pages: Number of pages in the chapter (int or numeric string).

    Returns:
        list: One link per page, in order from page 1 to *total_pages*.
    """
    # Split once instead of on every iteration; rpartition keeps the separator
    # so links without any "/" round-trip unchanged.
    directory, separator, first_page = baselink.rpartition("/")
    return [
        # count=1: only the page number is substituted, so any other "1"
        # later in the file name (e.g. in a hash suffix) is not corrupted.
        directory + separator + first_page.replace("1", str(page), 1)
        for page in range(1, int(total_pages) + 1)
    ]
def load_mangadex_chapter(driver, chapter_link, delay = 2):
    """Open a chapter page and collect the information found on it.

    Args:
        driver: Webdriver used to load the page.
        chapter_link: Link to the chapter; its fifth "/"-separated segment
            is used as the chapter id.
        delay: Seconds to wait in total; half is slept before looking for
            the "message" element and half is the timeout for the page
            image to appear.

    Returns:
        dict: {
            "title": manga title,
            "ctitle": chapter title,
            "total_pages": total pages (text as shown on the page),
            "pages": [image links],
            "next_chapter": link to next chapter (or False)
        }
    """
    page_image_xpath = "html/body/div[1]/div[2]/div[2]/div/img"
    chapter_option_xpath = "html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/select/option[@value='{}']"

    driver.get(chapter_link)
    try:
        half_delay = delay/2
        sleep(half_delay)
        # The original author notes this element is for the "gap" situation.
        notice = driver.find_element_by_class_name("message")
        ActionChains(driver).move_to_element(notice).click().perform()
        image_located = expected_conditions.presence_of_element_located((By.XPATH, page_image_xpath))
        WebDriverWait(driver, half_delay).until(image_located)
    except Exception:
        # Any failure above just results in one extra second of waiting.
        sleep(1)

    chapter_id = chapter_link.split("/")[4]
    manga_title = driver.find_element_by_class_name("manga-link").text
    chapter_title = driver.find_element_by_xpath(chapter_option_xpath.format(chapter_id)).text
    page_count = driver.find_element_by_class_name("total-pages").text
    first_image = driver.find_element_by_xpath(page_image_xpath).get_attribute("src")

    page_links = getallpages(first_image, page_count)
    following = nextchapter(driver)
    return {
        "title": manga_title,
        "ctitle": chapter_title,
        "total_pages": page_count,
        "pages": page_links,
        "next_chapter": following,
    }
def get_whole_mangadex(driver, first_chapter, single_chapter = False):
    """Collect chapter data starting at *first_chapter* and following the
    "next chapter" links until none is left.

    Args:
        driver: Webdriver used to load each chapter page.
        first_chapter: Link of the chapter to start from.
        single_chapter: When true, stop after the first chapter.

    Returns:
        dict: {
            "title": manga title,
            "total_pages": total pages over all collected chapters,
            "chapters": [{"title": chapter title, "pages": [image links]}, ...]
        }
    """
    manga = {
        "title": "",
        "total_pages": 0,
        "chapters": [],
    }
    link = first_chapter
    while True:
        chapter = load_mangadex_chapter(driver, link)
        manga["chapters"].append({
            "title": chapter["ctitle"],
            "pages": chapter["pages"],
        })
        manga["total_pages"] += int(chapter["total_pages"])
        manga["title"] = chapter["title"]

        if single_chapter:
            break
        # Stop when there is no next link or it does not point at a chapter.
        if not (chapter["next_chapter"] and "chapter" in chapter["next_chapter"]):
            break
        link = chapter["next_chapter"]
    return manga
def download_link(url, timeout = 30):
    """Download a file (e.g. a page image) and return its raw bytes.

    Args:
        url: Link to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        bytes: The body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never returned as if it were image data.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    # The context manager releases the connection even if the status check raises.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        return response.content
def download_chapter(chaptername, pages):
    """Download every page of one chapter into its own folder.

    The folder is created in the current working directory and named after
    *chaptername* with spaces (and path separators) replaced by "-". Pages
    are saved as ``page0.jpg``, ``page1.jpg``, ... in the order given.

    Args:
        chaptername: Title of the chapter, used to derive the folder name.
        pages: Iterable of image links.
    """
    folder = "-".join(chaptername.split(" "))
    # A path separator in the title would otherwise be read as a nested
    # directory and make the folder creation fail.
    for separator in filter(None, (os.sep, os.altsep)):
        folder = folder.replace(separator, "-")
    os.makedirs(folder, exist_ok=True)

    for index, page_url in enumerate(pages):
        target = os.path.join(folder, "page" + str(index) + ".jpg")
        content = download_link(page_url)
        with open(target, "wb") as f:
            f.write(content)
def download_manga(first_chapter_link, single_chapter = False, driver = False):
    """Download an entire manga, or just a single chapter.

    A folder named after the manga title is created in the current working
    directory and every chapter is downloaded into it.

    Args:
        first_chapter_link: Link of the chapter to start from.
        single_chapter: When true, only that chapter is downloaded.
        driver: An existing Chrome webdriver to reuse. If anything else is
            passed, a new driver is created for this call and shut down
            again once the chapter list has been collected.
    """
    owns_driver = not isinstance(driver, webdriver.Chrome)
    if owns_driver:
        driver = init_driver()
    try:
        manga = get_whole_mangadex(driver, first_chapter_link, single_chapter)
    finally:
        # A driver created here is not visible to the caller, so it must be
        # closed here or the browser process would be left running.
        if owns_driver:
            driver.quit()

    chapters = manga["chapters"]
    if single_chapter:
        print("Downloading 1 chapter...")
    else:
        print(len(chapters), "chapters found in the manga", manga["title"])

    os.makedirs(manga["title"], exist_ok=True)
    previous_dir = os.getcwd()
    os.chdir(manga["title"])
    try:
        for number, chapter in enumerate(chapters, start=1):
            download_chapter(chapter["title"], chapter["pages"])
            print("Downloaded", " »" + chapter["title"] + "« ", str(number) + "/" + str(len(chapters)))
        print("\n")
    finally:
        # Always return to the starting directory, even if a download fails,
        # so a following call does not nest its folder inside this one.
        os.chdir(previous_dir)
# Command-line interface
def _build_parser():
    """Assemble the argument parser describing the script's options."""
    cli = argparse.ArgumentParser(
        description='Download a mangadex chapter or a whole manga.',
    )
    cli.add_argument(
        'url',
        metavar='url',
        type=str,
        nargs='?',
        default=False,
        help='mangadex chapter link',
    )
    cli.add_argument(
        '-s', '--single',
        action='store_true',
        help='Use this option to only download the linked chapter',
    )
    cli.add_argument(
        '-L', '--list',
        metavar='urls',
        nargs='*',
        default=False,
        help='Use this for multiple chapters. Combine with -s or --single to only download those chapters',
    )
    cli.add_argument(
        '-D', '--driver',
        metavar='driver',
        default=False,
        help='Path for chromedriver',
    )
    return cli


parser = _build_parser()
# Script entry point
def _start_driver(driver_path):
    """Start a webdriver, preferring *driver_path* and falling back to the
    default chromedriver location if that path fails with a WebDriverException."""
    if driver_path:
        try:
            return init_driver(driver_path)
        except WebDriverException:
            return init_driver()
    return init_driver()


def main(argv=None):
    """Parse the command line and download the requested chapters.

    A positional url takes precedence over the -L/--list urls. With neither,
    the help text is printed. Links are validated before a browser is
    started, and the browser is always shut down at the end.

    Args:
        argv: Argument list to parse; defaults to the process arguments.
    """
    parsed = parser.parse_args(argv)

    if parsed.url:
        urls = [parsed.url]
        invalid_message = 'Please use a valid url'
    elif parsed.list:
        urls = list(parsed.list)
        invalid_message = 'Please use valid urls'
    else:
        parser.print_help()
        return

    # Check the first link before launching Chrome so that a typo (or a bare
    # invocation) does not pay for a browser start-up.
    if not check_if_valid_url(urls[0]):
        print(invalid_message)
        return

    driver = _start_driver(parsed.driver)
    try:
        for url in urls:
            if not check_if_valid_url(url):
                print(invalid_message)
                break
            download_manga(url, parsed.single, driver)
    finally:
        # Without this the headless Chrome process outlives the script.
        driver.quit()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment