Python's slowly growing on me...
# coding=utf-8
# Mangatown Scraper 1.2
# Henry (chocolatkey) 2017
import argparse
import logging
import os
import re
import sys
import threading
import time
import traceback
import urllib.request

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()

logging.basicConfig(filename='scraper.log', filemode='w', level=logging.DEBUG)  # Lower the level if DEBUG is too noisy. Be careful: filemode='w' overwrites the previous log on every run
kmp = "http://www.mangatown.com/"  # Muh web URL base
nmangas = 0  # Amount of Mangas/huas/hwas
nchaps = 0  # Amount of chapters
nlchaps = 0  # Amount of licensed chapters
npics = 0  # Amount of pictures (pages)
err1 = 0  # Lvl1 Errors: Probably a timed-out connection to the server, not usually related to the scraper itself
err2 = 0  # Lvl2 Errors: Shouldn't be happening, something went wrong in the chapter worker thread
err3 = 0  # Lvl3 Errors: Occurs when the scraper can't make sense of the chapter webpage (no <img id="image"> in the reader). Actually occurs, will look into why soon

def stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3):  # yeah I know it's kind of silly passing the vars, do what u want
    return "--------------------\nTotal stats: \nMangas: " + str(nmangas) + \
           "\nChapters: " + str(nchaps) + \
           "\nLicensed Chapters: " + str(nlchaps) + " (Add ~" + str(nlchaps*16) + " pics)" + \
           "\nPictures: " + str(npics) + \
           "\nLvl1 Errors: " + str(err1) + \
           "\nLvl2 Errors: " + str(err2) + \
           "\nLvl3 Errors: " + str(err3)

def urrq(url):
    # Fetch a page with a spoofed browser User-Agent and return it parsed as soup
    urlrq = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': ua.google
        }
    )
    return BeautifulSoup(urllib.request.urlopen(urlrq).read().decode('utf-8'), 'html.parser')

def pagedownloader(link, lb, number):  # Download one page image of a chapter into folder lb
    global err3
    global npics
    try:
        soupinst = urrq(link + str(number) + ".html")
        mmain = soupinst.find("img", {"id": "image"})
        if mmain is not None:
            npics += 1
            url = mmain.get("src")
            filename = str(url.split('/')[-1]).split('?')[0]
            fileloc = lb + os.sep + filename
            if os.path.isfile(fileloc):
                logging.debug("Pic exists: " + filename)
            else:
                logging.debug("Retrieving " + filename)
                try:
                    urllib.request.urlretrieve(url, fileloc)  # save img
                except Exception as e:
                    logging.error(str(e))
                time.sleep(0.25)
        else:
            logging.error("No image @" + link + str(number) + ".html")
            err3 += 1
    except Exception as err:  # Means we probably sent too many requests
        print(link + str(number) + ".html" + " cooldown" + "\n" + str(err))
        time.sleep(1)  # Cooldown
        pagedownloader(link, lb, number)  # Try again
        #os._exit(1)

def chapworker(link, lb):  # Individual chapter thread worker
    global npics
    global nlchaps
    global err2
    global err3
    try:
        soup = urrq(link)
        pagecount = soup.find("div", {"class": "page_select"}).find("select", {"onchange": "javascript:location.href=this.value;"}).find_all("option")  # pages
        lastpage = int(pagecount[-1].text)  # Get last page
        for number in range(1, lastpage + 1):  # inclusive of the last page
            time.sleep(0.05)
            pagedownloader(link, lb, number)
    except Exception as e:
        logging.error(str(e))
        err2 += 1
        time.sleep(1)
        chapworker(link, lb)

def main():
    global kmp
    global nmangas
    global nchaps
    global nlchaps
    global npics
    global err1
    global err2
    global err3
    DESCRIPTION = '''Scrape Mangatown for comics, chapters, pages and info, then download, check or add info to the collection'''
    MODE_HELP = '''Running mode. Possible options you can combine: s (scrape), c (check), i (get info), o (overwrite).
                   Example: mangatownscraper.py --mode sio'''
    BEGIN_HELP = '''Directory page to start at'''
    firstnum = 1
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    #parser.add_argument('files', nargs='+', help=FILES_HELP)
    #parser.add_argument('-m', '--mode', dest='mode', help=MODE_HELP, required=True)  # TODO: work with mode
    parser.add_argument('-b', '--begin', dest='begin', help=BEGIN_HELP)
    options = parser.parse_args()
    if options.begin is not None:
        firstnum = int(options.begin)
    print("Starting from page " + str(firstnum))
    soup = urrq(kmp + "directory/")
    pages = soup.find("div", {"class": "next-page"}).find_all(href=re.compile(r"/directory/[A-Za-z0-9-_/.]+"), class_=False, id=False)
    lastnum = int(re.match(r".+/(\d+)\.htm$", pages[-1].get('href')).group(1))  # Get last directory page
    print(str(lastnum) + " pages to go through")
    time.sleep(1)
    for number in range(firstnum, lastnum + 1):  # directory pages, inclusive of the last one
        logging.info("=======================\nPage: " + str(number))
        soup = urrq(kmp + "directory/" + str(number) + ".htm")
        for mng in soup.find("ul", {"class": "manga_pic_list"}).find_all("li"):  # mangas
            link = mng.find("a", {"class": "manga_cover"})
            mtitle = str(link.get('title')).strip().encode('utf-8')
            mstub = str(re.match(r".+/manga/([A-Za-z0-9-_]+)/$", link.get('href')).group(1))
            logging.info(mtitle + b" (" + mstub.encode('utf-8') + b")")
            if not os.path.exists(mstub):
                os.makedirs(mstub)
            print(b"=====" + mtitle + b"=====")
            nmangas += 1
            try:
                soup = urrq(link.get('href'))
                threads = []
                chaptertable = soup.find("ul", {"class": "chapter_list"}).find_all("a")
                for link in chaptertable:  # chapters
                    chaptername = " ".join(str(link.find(text=True)).replace(" ", "").split())
                    chapternum = re.findall(re.compile(r'.+\ (\d+\.*\d*)$'), chaptername)[0]
                    logging.debug(b"\nChapter: " + chaptername.encode('utf-8') + b" (" + chapternum.encode('utf-8') + b")\n")
                    chapterpath = mstub + os.sep + chapternum
                    if not os.path.exists(chapterpath):
                        os.makedirs(chapterpath)
                        nchaps += 1
                        #chapworker(link.get('href'), chapterpath)
                        t = threading.Thread(target=chapworker, args=(link.get('href'), chapterpath,))
                        threads.append(t)
                        t.start()
                        time.sleep(0.1)  # beeee careful!
                    else:
                        logging.info("Chapter already exists!")
                        print("Chapter already exists!")
                for x in threads:
                    x.join()
            except Exception as e:
                logging.error(str(e))
                print(traceback.format_exc())
                err1 += 1
                time.sleep(1)
                pass
            print("\n" + stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3))  # Print stats after scraping each manga for verbose console output
            time.sleep(1)  # Give it a rest
    logging.info(stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3))  # Log final stats

if __name__ == '__main__':
    sys.exit(main())
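Rough usage note (the filename mangatownscraper.py is only a guess taken from the --mode help text; use whatever name you saved the gist under). The script needs the beautifulsoup4 and fake-useragent packages installed, and it writes each series into a folder named after its URL stub inside the current working directory, one subfolder per chapter number:

python mangatownscraper.py            starts scraping from directory page 1
python mangatownscraper.py --begin 5  resumes from directory page 5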