Created
January 8, 2015 14:40
-
-
Save nucular/a48224bc40f312fc1988 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# python3 chanrip.py --help | |
""" | |
Chanrip | |
======= | |
Rips a thread from 4chan or 8chan, downloading all posted files and saving all
replies to an HTML file. Optionally monitors the thread for changes, downloading
new replies automatically, and can also download thumbnails.
TODO: | |
- De-spaghettify and cleanup (it's a real mess currently) | |
- Make the chans/imageboards separate classes inheriting from an abstract class | |
- Themes for the output html? | |
- Support more imageboards | |
LICENSE: | |
The MIT License (MIT) | |
Copyright (c) 2014/2015 nucular | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
import os | |
import sys | |
import time, datetime | |
import re | |
import threading | |
import urllib.request | |
import urllib.error | |
import argparse | |
import json | |
# Per-imageboard settings, keyed by site name.
#   url        compiled pattern matched against the thread URL; its capture
#              groups are looked up by the "board"/"thread" indices
#   board      index into the match groups holding the board name
#   thread     index into the match groups holding the thread number
#   api        format string for the thread's JSON endpoint
#   file       format string for a full-size file URL
#   thumb      format string for a thumbnail URL
#   filename   format string for the local name of a downloaded file
#   thumbname  format string for the local name of a downloaded thumbnail
#   gifthumbs  whether thumbnails of .gif files are themselves .gif
CHANS = {}

CHANS["4chan"] = dict(
    url=re.compile(r"boards\.4chan\.org\/(\w+)\/thread\/(\d+)"),
    board=0,
    thread=1,
    api="https://a.4cdn.org/{board}/thread/{thread}.json",
    file="https://i.4cdn.org/{board}/{tim}{ext}",
    thumb="https://0.t.4cdn.org/{board}/{tim}s.jpg",
    filename="{tim}-{filename}{ext}",
    thumbname="{tim}-{filename}{thext}",
    gifthumbs=False,
)

CHANS["8chan"] = dict(
    url=re.compile(r"8chan\.co\/(\w+)\/res\/(\d+)\.html"),
    board=0,
    thread=1,
    api="https://8chan.co/{board}/res/{thread}.json",
    file="https://media.8chan.co/{board}/src/{tim}{ext}",
    thumb="https://media.8chan.co/{board}/thumb/{tim}{thext}",
    filename="{tim}-{filename}{ext}",
    thumbname="{tim}-{filename}{thext}",
    gifthumbs=True,
)

# Default User-Agent header sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/39.0.2171.95 Safari/537.36"
)
# Stylesheet embedded verbatim into the generated index.html (dark theme).
# It is passed to HTML_MAIN.format() as a *value*, so the literal braces
# below need no escaping.
CSS_TOMORROW = """
body {
  background: #1d1f21 none;
  color: #C5C8C6;
  font-family: arial,helvetica,sans-serif;
  font-size: 10pt;
}
a:link, a:visited {
  color: #81a2be;
  text-decoration: underline;
}
a:link:hover {
  color: #5F89AC;
}
span.deadlink {
  color: #f22;
  text-decoration: line-through;
}
header {
  margin: 1em 0;
  text-align: center;
}
header h1 {
  font-family: tahoma;
  letter-spacing: -2px;
  font-size: 20pt;
  margin: 0;
}
header div.subtitle {
  font-size: 8pt;
}
header div.subtitle a {
  color: #C5C8C6;
  text-decoration: none;
}
header div.subtitle a:hover {
  text-decoration: underline;
}
hr {
  border: 0;
  border-top: 1px solid #282a2e;
  height: 1px;
  clear: left;
}
p.fileinfo {
  display: block;
  margin: 0 0 0 20px;
}
.post {
  max-width: 80%;
}
.file {
  float: left;
  margin-right: 2px;
  width: 210px;
}
.file:not(.multifile) {
  float: none;
}
.unimportant, .unimportant * {
  font-size: 10px;
}
a .post-image {
  float: left;
  padding: 5px;
  margin: 0 20px 0 0;
  max-width: 98%;
  width: auto;
  height: auto;
  max-height: 200px;
  max-width: 200px;
}
a .full-image {
  position: absolute;
  left: 5px;
  padding: 5px;
  margin: 0 20px 0 0;
  max-width: 98%; max-width: calc(100% - 20px);
}
div.post.op {
  margin-right: 20px;
  margin-bottom: 5px;
}
div.post {
  padding-left: 20px;
  clear: both;
}
div.post-hover {
  position: absolute;
  margin: 0 !important;
  box-shadow: 0px 3px 10px rgba(0,0,0,0.5);
}
p.intro {
  clear: none;
  margin: 0.5em 0;
  padding: 0;
  padding-bottom: 0.2em;
}
p.intro span.subject {
  color: #b294bb;
  font-weight: bold;
}
p.intro span.name {
  color: #C5C8C6;
  font-weight: bold;
}
p.intro span.post_no a {
  color: #C5C8C6;
  margin: 0;
}
p.intro a {
  text-decoration: none;
}
div.post div.body {
  clear: both;
  word-wrap: break-word;
  white-space: pre-wrap;
}
div.post.reply {
  display: inline-block;
  background-color: #282a2e;
  border: 1px solid #282a2e;
  margin-bottom: 2px;
  margin-left: 16px;
  margin-top: 2px;
  max-width: 94%; max-width: calc(100% - 16px);
  padding: 0.2em 0.3em 0.5em 0.6em;
}
div.post.reply p {
  margin: 0.3em 0 0 0;
}
div.post.reply div.body {
  margin-left: 1.8em;
  margin-top: 0.8em;
  padding-right: 3em;
  padding-bottom: 0.3em;
}
span.quote {
  color: #adbd68;
}
"""
# Script embedded verbatim into the generated index.html. It relies on the
# jQuery build that HTML_MAIN loads just before it. The doubled backslashes
# are Python escapes: the browser sees \" inside the JavaScript strings.
JS_MAIN = """
// Only install the fallback when the browser has no native endsWith.
if (!String.prototype.endsWith) {
  String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;
  };
}

// Click on a thumbnail: toggle the full-size file inline.
$(".post-image-link").bind("click", function(e) {
  // Leave middle-click alone so the file can still be opened in a new tab.
  if (e.which == 2)
    return;
  e.preventDefault();
  if ($(this).hasClass("expanded")) {
    $(this).find(".full-image").remove();
    $(this).removeClass("expanded");
    $(window).scrollTop($(this).attr("data-scrolltop"));
  } else {
    var href = $(this).attr("href");
    if (href.endsWith(".mp4") || href.endsWith(".webm"))
      $("<video src=\\"" + href + "\\" class=\\"full-image\\" autoplay controls></video>").appendTo(this);
    else if (href.endsWith(".swf"))
      // <object> takes its resource from "data", not "src".
      $("<object data=\\"" + href + "\\" class=\\"full-image\\"></object>").appendTo(this);
    else
      $("<img src=\\"" + href + "\\" class=\\"full-image\\">").appendTo(this);
    $(this).addClass("expanded");
    $(this).attr("data-scrolltop", $(window).scrollTop());
  }
});

// Hovering a reply/quote link: show a floating copy of the referenced post.
$(".post-reply,.quotelink").bind("mouseenter", function(e) {
  var p = $($(this).attr("href"));
  // A jQuery object is always truthy; check that it actually matched a post.
  if (p.length) {
    p.clone().appendTo("body")
      .addClass("post-hover")
      .css($(this).position())
      .bind("mouseleave", function(e) {
        $(this).remove();
      });
  }
});
"""
# Page skeleton for index.html.
# Placeholders: {url} thread URL, {css} stylesheet text, {board} board name,
# {thread} rendered posts, {js} script text.
# The doctype comes first so browsers render the page in standards mode.
HTML_MAIN = """<!DOCTYPE html>
<!-- Thread ripped from {url} using ChanRip -->
<html>
<head>
  <style>{css}</style>
</head>
<body>
  <header>
    <h1>/{board}/</h1>
    <div class="subtitle"><a href="{url}">{url}</a></div>
  </header>
  <hr/>
  <div class="thread">
    {thread}
  </div>
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
  <script>
    {js}
  </script>
</body>
</html>
"""

# One attached file: an info line plus a thumbnail linking to the full file.
# Placeholders: {path} full file, {thumbpath} thumbnail, {name} stored name,
# {size} formatted size, {w}/{h} dimensions, {origname} uploaded file name.
HTML_IMAGE = """
<p class="fileinfo">File <a href="{path}">{name}</a>
  <span class="unimportant">({size}, {w}x{h}, {origname})</span>
</p>
<a href="{path}" target="_blank" class="post-image-link">
  <img class="post-image" src="{thumbpath}">
</a>
"""

# Opening post. Placeholders: {no}, {subject}, {name}, {uid}, {date},
# {replies} (links to replying posts) and {body}.
HTML_OP = """
<div class="post op" id="p{no}">
  <p class="intro">
    <span class="subject">{subject}</span>
    <span class="name">{name} {uid}</span>
    <time datetime="{date}">{date}</time>
    <span class="post_no">No.<a href="#p{no}">{no}</a></span>
    <span class="post_replies">{replies}</span>
  </p>
  <div class="body">{body}</div>
</div>
"""

# Reply post. Same placeholders as HTML_OP minus {subject}, plus {files}
# (the rendered attachments, may be empty).
HTML_REPLY = """
<div class="post reply" id="p{no}">
  <p class="intro">
    <span class="name">{name} {uid}</span>
    <time datetime="{date}">{date}</time>
    <span class="post_no">No.<a href="#p{no}">{no}</a></span>
    <span class="post_replies">{replies}</span>
  </p>
  <div class="files">
    {files}
  </div>
  <div class="body">{body}</div>
</div>
"""
def sizeof_fmt(num, suffix='B'):
    """Format *num* as a human-readable size using binary (1024-based) prefixes.

    The value is divided by 1024 until its magnitude drops below 1024, then
    rendered with one decimal place, the matching prefix and *suffix*.
    Anything still too large after 'Zi' is reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    for prefix in prefixes:
        if abs(value) >= 1024.0:
            # Too big for this prefix: scale down and try the next one.
            value /= 1024.0
            continue
        return "%3.1f%s%s" % (value, prefix, suffix)
    return "%.1f%s%s" % (value, 'Yi', suffix)
def chunker(res, chunksize=4096, hook=None, store=False):
    """Read a response in chunks, optionally reporting progress and keeping the data.

    Parameters:
        res:       object providing ``read(size)`` and ``info()``; ``info()``
                   must return a mapping with a ``get`` method (used to look
                   up "Content-Length").
        chunksize: number of bytes requested per ``read`` call.
        hook:      optional callable invoked after every non-empty chunk as
                   ``hook(chunk, bytesread, totalbytes, percent)``. ``percent``
                   is an int, or the string "?" when the total size is unknown.
        store:     when true, the chunks are collected and returned.

    Returns:
        ``True`` if a KeyboardInterrupt occurred while reading, otherwise the
        collected ``bytes`` when *store* is true, otherwise ``None``.
    """
    # A missing or malformed Content-Length simply means "size unknown".
    try:
        totalbytes = int((res.info().get("Content-Length") or "0").strip())
    except ValueError:
        totalbytes = 0

    bytesread = 0
    # Collect chunks in a list and join once; repeated bytes += is quadratic.
    parts = []
    try:
        while True:
            chunk = res.read(chunksize)
            if not chunk:
                break
            bytesread += len(chunk)
            if store:
                parts.append(chunk)
            if hook:
                if totalbytes > 0:
                    percent = int(bytesread / totalbytes * 100)
                else:
                    percent = "?"
                hook(chunk, bytesread, totalbytes, percent)
    except KeyboardInterrupt:
        # Callers treat a True result as "aborted by the user".
        return True

    if store:
        return b"".join(parts)
    return None
class UserAgentOpener(urllib.request.FancyURLopener):
    """URL opener that identifies itself with a caller-supplied User-Agent.

    NOTE: ``FancyURLopener`` belongs to the deprecated legacy interface of
    ``urllib.request``; this class is kept for compatibility only.

    Parameters:
        useragent: string sent as the User-Agent header.
        *args, **kwargs: forwarded unchanged to ``FancyURLopener``.
    """

    def __init__(self, useragent, *args, **kwargs):
        # The base initialiser builds its default "User-Agent" header from
        # self.version, so the attribute must be set before calling it.
        self.version = useragent
        # Without this call the instance lacks addheaders and the rest of
        # the opener state, and cannot be used to open anything.
        super().__init__(*args, **kwargs)
class Ripper(object):
    """Rip one imageboard thread into a local directory.

    A rip fetches the thread's JSON, downloads every attached file (and,
    optionally, its thumbnail) that is not on disk yet, and (re)writes an
    ``index.html`` rendering of the whole thread.

    Attributes:
        url:       thread URL as given by the caller.
        chan:      the matching entry of ``CHANS``.
        board:     board name extracted from the URL.
        threadid:  thread number extracted from the URL (string).
        thread:    the thread number until the first successful ``fetch()``,
                   afterwards the decoded thread JSON (dict with "posts").
        apiurl:    JSON endpoint of the thread.
        directory: output directory; thumbnails go into its ``_thumbs`` child.
        postcount: number of posts handled by the last completed rip.

    NOTE(review): file names are built from API-supplied values ("filename",
    "ext") without sanitising or URL-quoting; confirm the APIs never return
    path separators or characters that break local paths and links.
    """

    # A quote of another post inside a comment, raw (">>123") or HTML-escaped
    # ("&gt;&gt;123"). The greedy digit group captures the whole number, so a
    # quote of post 123 is never mistaken for a quote of post 12.
    _QUOTE_RE = re.compile(r"(?:>>|&gt;&gt;)(\d+)")
    # Absolute "/<board>/res/<thread>.html#<no>" reply links.
    _RESLINK_RE = re.compile(r'href="\/\w+\/res\/\d+\.html\#(\d+)"')
    _DATE_FORMAT = "%m/%d/%y (%a) %H:%M:%S"

    def __init__(self, url, directory, verbose=False, useragent=USER_AGENT, thumbs=True):
        """Resolve *url* against ``CHANS`` and prepare *directory*.

        Raises:
            NotImplementedError: if no entry of ``CHANS`` matches *url*.
        """
        self.verbose = verbose
        self.useragent = useragent
        self.thumbs = thumbs
        self.ripped = []
        self.postcount = 0

        # Resolve the board first so an unsupported URL does not leave empty
        # directories behind.
        self.chan = None
        m = None
        for chan in CHANS.values():
            m = chan["url"].search(url)
            if m:
                self.chan = chan
                break
        if not self.chan:
            raise NotImplementedError("{} is not supported (yet)".format(url))

        self.url = url
        groups = m.groups()
        self.board = groups[self.chan["board"]]
        self.threadid = groups[self.chan["thread"]]
        # Kept for backward compatibility: fetch() replaces this with the
        # decoded JSON, the id itself stays available as self.threadid.
        self.thread = self.threadid
        self.apiurl = self.chan["api"].format(board=self.board, thread=self.threadid)

        os.makedirs(directory, exist_ok=True)
        if thumbs:
            os.makedirs(os.path.join(directory, "_thumbs"), exist_ok=True)
        self.directory = directory

        print("API: " + self.apiurl)

    def fetch(self):
        """Download and decode the thread JSON into ``self.thread``.

        Returns:
            ``True`` when the thread could not be fetched (request error,
            user interrupt, undecodable response), otherwise ``None``.
            ``self.thread`` is only replaced on success.
        """
        def hook(chunk, read, total, percent):
            sys.stdout.write("\rFetching thread... {}/{} {}%".format(read, total, percent))
            sys.stdout.flush()

        req = urllib.request.Request(self.apiurl)
        req.add_header("User-Agent", self.useragent)
        try:
            res = urllib.request.urlopen(req)
        except urllib.error.URLError as e:
            print(e)
            return True

        try:
            data = chunker(res, chunksize=128, hook=hook, store=True)
        finally:
            res.close()
        print("")

        # chunker() signals a KeyboardInterrupt by returning True instead of
        # the payload.
        if data is True:
            print("Interrupted by user")
            return True

        try:
            thread = json.loads(data.decode("utf-8"))
        except ValueError as e:
            # Covers both invalid UTF-8 and invalid JSON.
            print(e)
            return True
        self.thread = thread

    def downloadFile(self, name, path, url, no=1, of=1):
        """Download *url* to *path*, printing progress as "no/of name...".

        Returns:
            ``True`` if the user interrupted the download, otherwise ``None``
            (also when the request itself failed, which is only printed).
            An incomplete file is never left on disk.
        """
        req = urllib.request.Request(url)
        req.add_header("User-Agent", self.useragent)
        try:
            res = urllib.request.urlopen(req)
        except urllib.error.URLError as e:
            print(e)
            return

        completed = False
        interrupted = None
        try:
            with open(path, "wb") as s:
                def hook(chunk, read, total, percent):
                    sys.stdout.write("\r{}/{} {}... {}/{} {}%".format(no, of, name, read, total, percent))
                    sys.stdout.flush()
                    s.write(chunk)
                interrupted = chunker(res, hook=hook, chunksize=16384)
            completed = not interrupted
        finally:
            res.close()
            # Remove partial output only after the file is closed (required
            # on Windows); otherwise a later rip would see the path and
            # treat the file as already downloaded.
            if not completed and os.path.exists(path):
                os.remove(path)

        if interrupted:
            print("\nInterrupted by user")
            return True
        print("")

    def _thumbExt(self, f):
        """Return the thumbnail extension for file dict *f* (based on its own "ext")."""
        if self.chan["gifthumbs"] and f["ext"] == ".gif":
            return ".gif"
        return ".jpg"

    def _thumbPath(self, f):
        """Return the index.html-relative path of the image shown for *f*.

        Without thumbnails the full file doubles as its own preview.
        """
        if not self.thumbs:
            return self.chan["filename"].format(**f)
        f["thext"] = self._thumbExt(f)
        # This path ends up in HTML, so always use "/" (not os.sep).
        return "_thumbs/" + self.chan["thumbname"].format(**f)

    def _renderFile(self, f, multifile):
        """Render one attachment *f* as a "file" div."""
        cls = "file multifile" if multifile else "file"
        return "\n<div class=\"{}\">".format(cls) + HTML_IMAGE.format(
            path=self.chan["filename"].format(**f),
            thumbpath=self._thumbPath(f),
            name=str(f["tim"]) + f["ext"],
            size=sizeof_fmt(f["fsize"]),
            w=f["w"], h=f["h"],
            origname=f["filename"] + f["ext"]
        ) + "</div>"

    def _renderFiles(self, post):
        """Render all attachments of *post*; empty string if it has none."""
        if "tim" not in post:
            return ""
        multifile = "extra_files" in post
        parts = ["<div class=\"files\">", self._renderFile(post, multifile)]
        for extra in post.get("extra_files") or []:
            parts.append(self._renderFile(extra, True))
        parts.append("</div>")
        return "".join(parts)

    def _postFields(self, post):
        """Return the template fields shared by HTML_OP and HTML_REPLY."""
        replies = post.get("replies")
        # "replies" is only a list once rip() has mapped the quotes.
        if not isinstance(replies, list):
            replies = []
        return {
            # Some posts carry no "name" key; render them without a name.
            "name": post.get("name", ""),
            "uid": ("(ID: " + post["id"] + ")") if "id" in post else "",
            "date": datetime.datetime.fromtimestamp(post["time"]).strftime(self._DATE_FORMAT),
            "no": post["no"],
            "body": post.get("com") or "",
            "replies": " ".join(
                "<a class=\"post-reply\" href=\"#p{no}\">>>{no}</a>".format(no=i)
                for i in replies
            ),
        }

    def writeThread(self):
        """Render ``self.thread`` to ``index.html`` in the output directory."""
        print("Writing thread to index.html...")
        posts = self.thread["posts"]
        op = posts[0]

        # The OP's files precede its post block; replies embed theirs.
        parts = [
            self._renderFiles(op),
            HTML_OP.format(subject=op.get("sub") or "", **self._postFields(op)),
        ]
        for p in posts[1:]:
            parts.append(HTML_REPLY.format(files=self._renderFiles(p), **self._postFields(p)))
            parts.append("<br/>")

        # Emit pure ASCII with numeric character references for everything
        # else: the page then displays correctly whatever charset the browser
        # assumes, and writing never fails on the platform default encoding.
        target = os.path.join(self.directory, "index.html")
        with open(target, "w", encoding="ascii", errors="xmlcharrefreplace") as s:
            s.write(HTML_MAIN.format(
                url=self.url,
                css=CSS_TOMORROW,
                js=JS_MAIN,
                board=self.board,
                thread="".join(parts)
            ))

    def _mapReplies(self, posts):
        """Fill each post's "replies" list with the numbers of posts quoting it.

        Only quotes made by the post itself or by later posts are counted.
        """
        index_by_no = {}
        for i, p in enumerate(posts):
            index_by_no[str(p["no"])] = i
            # The API may deliver "replies" as something other than a list
            # (or not at all); normalise so we can append to it.
            if not isinstance(p.get("replies"), list):
                p["replies"] = []
        for i, quoting in enumerate(posts):
            for no in self._QUOTE_RE.findall(quoting.get("com") or ""):
                target = index_by_no.get(no)
                if target is None or target > i:
                    continue
                replies = posts[target]["replies"]
                if quoting["no"] not in replies:
                    replies.append(quoting["no"])

    def _queueFile(self, f, files, thumbs):
        """Append download jobs for file dict *f* unless already on disk."""
        name = self.chan["filename"].format(**f)
        path = os.path.join(self.directory, name)
        if not os.path.exists(path):
            files.append([name, path, self.chan["file"].format(**f)])
        if self.thumbs:
            f["thext"] = self._thumbExt(f)
            name = self.chan["thumbname"].format(**f)
            path = os.path.join(self.directory, "_thumbs", name)
            if not os.path.exists(path):
                thumbs.append([name, path, self.chan["thumb"].format(**f)])

    def rip(self):
        """Fetch the thread, download files of new posts and rewrite index.html.

        Does nothing beyond printing a message when the fetch fails or the
        thread has no more posts than at the previous rip.
        """
        if self.fetch():
            return
        posts = self.thread["posts"]
        postcount = len(posts)
        if postcount <= self.postcount:
            print("No new posts")
            return

        self._mapReplies(posts)

        files = []
        thumbs = []
        context = {"board": self.board, "thread": self.threadid}
        for i, v in enumerate(posts):
            # Turn absolute reply links into in-page anchors.
            if "com" in v:
                v["com"] = self._RESLINK_RE.sub(r'href="#p\1" class="quotelink"', v["com"])
            # Posts handled by an earlier rip need no new downloads.
            if i < self.postcount:
                continue
            v.update(context)
            if "tim" not in v:
                continue
            self._queueFile(v, files, thumbs)
            for extra in v.get("extra_files") or []:
                extra.update(context)
                self._queueFile(extra, files, thumbs)

        print("{} not indexed post(s), downloading {} file(s)".format(postcount - self.postcount, len(files)))
        for i, v in enumerate(files):
            if self.downloadFile(*v, no=i+1, of=len(files)):
                break
        if self.thumbs and len(thumbs) > 0:
            print("Downloading {} thumbnail(s)".format(len(thumbs)))
            for i, v in enumerate(thumbs):
                if self.downloadFile(*v, no=i+1, of=len(thumbs)):
                    break

        self.postcount = postcount
        self.writeThread()

    def monitor(self, delay):
        """Re-rip the thread every *delay* seconds, forever.

        Never returns; callers stop it with KeyboardInterrupt.
        """
        # Never poll without a pause: a non-positive delay would otherwise
        # hammer the server in a tight loop.
        delay = max(delay, 1)
        cleartimer = " " * (len(str(delay)) * 2 + 15)
        while True:
            timer = delay
            while timer > 0:
                sys.stdout.write("\r" + cleartimer)
                sys.stdout.write("\rMonitoring... {}/{}".format(timer, delay))
                sys.stdout.flush()
                time.sleep(1)
                timer -= 1
            print("")
            self.rip()
# Command-line entry point: rip once, then optionally keep monitoring.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Thread ripper for image boards implementing the 4chan/8chan "
                    "APIs with optional monitoring."
    )
    parser.add_argument("URL", help="a link to the board thread")
    parser.add_argument("DIR", help="a folder to rip to")
    parser.add_argument("-m", "--monitor", action="store_true", help="keep running and regularly check thread for changes")
    parser.add_argument("-d", "--delay", type=int, metavar="SEC", default=20, help="delay between monitoring checks, defaults to 20")
    parser.add_argument("-v", "--verbose", action="store_true", help="show some debug logging")
    parser.add_argument("-u", "--useragent", default=USER_AGENT, help="the user agent to use on requests")
    parser.add_argument("--nothumbs", action="store_true", default=False, help="don't download thumbs with the files")
    args = parser.parse_args()

    # A non-positive delay would make the monitor poll without pausing.
    if args.monitor and args.delay < 1:
        parser.error("--delay must be at least 1 second")

    try:
        ripper = Ripper(args.URL, args.DIR, verbose=args.verbose, useragent=args.useragent, thumbs=not args.nothumbs)
    except NotImplementedError as e:
        # Ripper raises this for URLs it cannot match to a known board;
        # report it as a usage error instead of a traceback.
        parser.error(str(e))

    # Ctrl-C ends the program cleanly both during the first rip and while
    # monitoring.
    try:
        ripper.rip()
        if args.monitor:
            ripper.monitor(args.delay)
    except KeyboardInterrupt:
        pass
    print("\nDone!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment