Last active
May 24, 2018 06:46
-
-
Save Karasiq/11d00203e9816c76264d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{OutputStream, PrintWriter} | |
import akka.NotUsed | |
import akka.actor.{ActorRef, Props} | |
import akka.stream.scaladsl.Source | |
import com.gargoylesoftware.htmlunit.html._ | |
import com.gargoylesoftware.htmlunit._ | |
import com.karasiq.common.{StringUtils, ThreadLocalFactory} | |
import com.karasiq.fileutils.PathUtils | |
import com.karasiq.gallerysaver.scripting.internal.{GallerySaverContext, Loaders, LoaderUtils} | |
import com.karasiq.gallerysaver.scripting.loaders.HtmlUnitGalleryLoader | |
import com.karasiq.gallerysaver.scripting.resources._ | |
import com.karasiq.networkutils.HtmlUnitUtils._ | |
import com.karasiq.networkutils.cloudflare.{CloudFlareCookieRetriever, CloudFlareUtils} | |
import com.karasiq.networkutils.downloader.{FileDownloaderActor, FileDownloaderTraits, HttpClientFileDownloader} | |
import com.karasiq.networkutils.url.URLParser | |
import scala.concurrent.Future | |
import scala.language.postfixOps | |
import scala.util.{Failure, Success, Try} | |
// Internal | |
object SosachParsers { | |
/** Loader settings: the `gallery-saver.2ch` section of `LoaderUtils.config`. */
val config: com.typesafe.config.Config =
  LoaderUtils.config.getConfig("gallery-saver.2ch")
/**
 * Value of the `x-forwarded-for` setting, sent as the `X-Forwarded-For` request header.
 * @return Configured address, or `None` if the setting cannot be read
 */
def fakeIp: Option[String] =
  Try(config.getString("x-forwarded-for")) match {
    case Success(ip) => Some(ip)
    case Failure(_) => None
  }
/**
 * Session cookies taken from the configuration.
 * Settings that cannot be read are skipped, so the map may be empty.
 * @return Cookie name → cookie value
 */
def session: Map[String, String] = {
  // Cookie name paired with the configuration key that holds its value
  val cookieSettings = Seq("usercode_auth" -> "session", "cf_clearance" -> "cf-clearance")
  cookieSettings.flatMap { case (cookieName, configKey) =>
    Try(config.getString(configKey)).toOption.map(value => cookieName -> value)
  }.toMap
}
/** Cookie retriever used to pass the CloudFlare challenge page. */
val cloudFlareBypass = CloudFlareCookieRetriever()

/**
 * Factory of CloudFlare-compatible web clients.
 * If the `user-agent` setting is readable, it is applied to the client's browser version.
 */
val cfWebClientFactory = ThreadLocalFactory.softRef[WebClient] {
  val customUserAgent = Try(config.getString("user-agent")).toOption
  val client = CloudFlareUtils.compatibleWebClient()
  customUserAgent.foreach(userAgent => client.getBrowserVersion.setUserAgent(userAgent))
  client
}
/**
 * File downloader actor, created on first access and registered as `sosachFileDownloader`.
 *
 * The underlying HTTP client reuses the user agent and proxy settings of the web client
 * produced by `cfWebClientFactory`, and sends the `X-Forwarded-For` header when `fakeIp` is set.
 */
lazy val cfFileDownloader: ActorRef = {
  import com.karasiq.networkutils.HttpClientUtils._
  import org.apache.http.message.BasicHeader

  val webClient = cfWebClientFactory()
  val builder = defaultSettings.builder
    .setUserAgent(webClient.getBrowserVersion.getUserAgent)

  fakeIp.foreach { ip =>
    // Explicit Java collection instead of the deprecated implicit scala.collection.JavaConversions
    builder.setDefaultHeaders(java.util.Collections.singletonList(new BasicHeader("X-Forwarded-For", ip)))
  }

  // The proxy is copied only when the web client actually has a proxy host configured
  Option(webClient.getOptions.getProxyConfig)
    .filter(_.getProxyHost ne null)
    .foreach(proxyConfig => builder.setProxy(proxyConfigToProxy(proxyConfig)))

  val history = LoaderUtils.fdHistory
  val converter = LoaderUtils.fdConverter
  val props = Props(
    new HttpClientFileDownloader(builder)
      with FileDownloaderActor
      with history.WithHistory
      with converter.WithImageConverter
      with FileDownloaderTraits.CheckSize
      with FileDownloaderTraits.CheckModified
  )
  LoaderUtils.actorSystem.actorOf(props, "sosachFileDownloader")
}
/**
 * Header fields of a single post.
 * @param postId          Post number
 * @param posterName      Name of the poster
 * @param postTime        Post time, as shown on the page
 * @param title           Post title
 * @param fileDescription Description of the attached files
 */
case class PostHeader(postId: Int, posterName: String, postTime: String, title: String, fileDescription: String) {
  /** Single-line representation: `#id name (time) - title [files]`. */
  def format: String = {
    val author = s"#$postId $posterName ($postTime)"
    val subject = s"$title [$fileDescription]"
    s"$author - $subject"
  }
}
/**
 * Single post of a thread.
 * @param text   Post message text
 * @param header Formatted header line of the post
 * @param images URLs of the files attached to the post
 */
case class Post(text: String, header: String, images: Seq[String] = Nil)

/**
 * Parsed thread.
 * @param url   URL of the thread page
 * @param posts Posts in page order; the first one is the opening post
 */
case class Thread(url: String, posts: Seq[Post]) {
  /**
   * Thread number taken from the `/res/NNN.html` part of the URL.
   * @return `None` if the URL has no such part or the number does not fit into a `Long`
   */
  def id: Option[Long] = {
    // The dot is escaped so that only a literal ".html" suffix matches
    val threadIdPattern = """/res/(\d+)\.html""".r
    threadIdPattern.findFirstMatchIn(url)
      .flatMap(m => Try(m.group(1).toLong).toOption)
  }

  /** Opening post. Throws `NoSuchElementException` if the thread has no posts. */
  def opPost: Post = posts.head

  /** All posts after the opening one. Throws `UnsupportedOperationException` if the thread has no posts. */
  def answers: Seq[Post] = posts.tail
}
/** Converts a loaded thread page into a [[Thread]]. */
trait ThreadParser {
  def parseThread(page: Page): Thread
}

object ThreadParser {
  /** Footer text that identifies the old (Wakaba) engine. */
  private val WakabaSignature = "wakaba 3.0.8-mk2"

  /**
   * Selects the parser for the engine that rendered the page.
   * @param page Loaded thread page
   * @return Wakaba parser if the page footer contains the Wakaba signature, Makaba parser otherwise
   * @throws IllegalArgumentException if the page is not an HTML page
   */
  def forPage(page: Page): ThreadParser = page match {
    case htmlPage: HtmlPage =>
      // The XPath selects either a <p> or a <footer> element, so the result is typed as the
      // common HtmlElement; typing it as HtmlFooter does not fit the <p> alternative.
      val footer = htmlPage.firstByXPath[HtmlElement]("//p[@class='footer']|//footer[@class='footer']")
      if (footer.exists(_.asText().contains(WakabaSignature))) new WakabaHtmlParser
      else new MakabaHtmlParser

    case p =>
      throw new IllegalArgumentException("No parser found for page: " + p)
  }
}
object Thread {
  /** Parses the page with the parser chosen by `ThreadParser.forPage`. */
  def apply(page: Page): Thread = {
    val parser = ThreadParser.forPage(page)
    parser.parseThread(page)
  }

  /** Extractor: yields the parsed thread unless it contains no posts. */
  def unapply(page: Page): Option[Thread] =
    Some(apply(page)).filter(_.posts.nonEmpty)
}
/**
 * Old 2ch.hk engine HTML parser
 */
private final class WakabaHtmlParser extends ThreadParser {
  private val postIdPattern = "[\\d]+".r

  /**
   * Parses every post node of the page.
   * @throws IllegalArgumentException if the page is not an HTML page
   */
  def parseThread(page: Page): Thread = page match {
    case htmlPage: HtmlPage =>
      val postNodes = htmlPage.byXPath[HtmlElement]("//div[@class='thread']/div[@class='oppost']|//div[@class='thread']/table[@class='post']/tbody/tr/td")
      Thread(page.getUrl.toString, postNodes.map(p => parsePost(p)).toIndexedSeq)

    case other =>
      throw new IllegalArgumentException("Not an HTML page: " + other)
  }

  /** Extracts header, text and image URL from a single post node. */
  private def parsePost(p: HtmlElement): Post = {
    // CSS class → trimmed text of every span in the post; unknown classes yield ""
    val spans = (p \\ classOf[HtmlSpan])
      .flatMap(span => span.classes.map(_ -> StringUtils.htmlTrim(span.getTextContent)))
      .toMap.withDefault(_ => "")

    // Falls back to 0 when the reflink has no digits or the number does not fit into an Int
    val postId = postIdPattern.findFirstIn(spans("reflink"))
      .flatMap(digits => Try(digits.toInt).toOption)
      .getOrElse(0)
    val header = PostHeader(postId, spans("postername"), spans("posttime"), spans("filetitle"), spans("filesize"))

    val text = ((p \\ classOf[HtmlBlockQuote] *@\ "postMessage") \ classOf[HtmlParagraph])
      .fold("")(_.asText())

    // Prefer the link that wraps the thumbnail; otherwise use the thumbnail source itself
    val thumb = p \\ classOf[HtmlImage] *@\ "img"
    val images = thumb.map(_.getParentNode).collect {
      case a: HtmlAnchor => a.fullHref
    }.orElse(thumb.map(_.fullSrc)).toVector

    Post(text, header.format, images)
  }
}
/**
 * New 2ch.hk engine HTML parser
 */
private final class MakabaHtmlParser extends ThreadParser {
  private val postIdPattern = "[\\d]+".r

  /**
   * Parses every post node of the page.
   * @throws IllegalArgumentException if the page is not an HTML page
   */
  override def parseThread(page: Page): Thread = page match {
    case htmlPage: HtmlPage =>
      val postNodes = htmlPage.byXPath[HtmlElement]("//form[@id='posts-form']/div[1]/div/div")
      Thread(page.getUrl.toString, postNodes.map(p => parsePost(p)).toIndexedSeq)

    case other =>
      throw new IllegalArgumentException("Not an HTML page: " + other)
  }

  /** Extracts header, text and attachment URLs from a single post node. */
  private def parsePost(p: HtmlElement): Post = {
    // CSS class → trimmed text of every span in the post details; unknown classes yield ""
    val details: Map[String, String] = (p @\ "post-details" \\ classOf[HtmlSpan]).flatMap {
      e => e.classes.map(_ -> StringUtils.htmlTrim(e.getTextContent))
    }.toMap.withDefaultValue("")

    // Prefer the link that wraps the thumbnail; the WebM placeholder logo is not an attachment
    val images = (p @\ "images" \\ classOf[HtmlImage])
      .filterNot(_.getSrcAttribute.contains("/makaba/templates/img/webm-logo.png"))
      .map { img =>
        img.getParentNode match {
          case a: HtmlAnchor => a.fullHref
          case _ => img.fullSrc
        }
      }
      .toIndexedSeq

    val text = (p \ classOf[HtmlBlockQuote]).fold("")(e => StringUtils.htmlTrim(e.asText()))

    // The original code reads the time from the first line and the post number from the third.
    // Lines are accessed safely: a missing or shorter span no longer fails the whole thread.
    val reflinkLines = details("posttime-reflink").split('\n')
    val postTime = StringUtils.htmlTrim(reflinkLines.headOption.getOrElse(""))
    val postId = reflinkLines.drop(2).headOption
      .flatMap(line => postIdPattern.findFirstIn(line))
      .flatMap(digits => Try(digits.toInt).toOption)
      .getOrElse(0)

    val fileNames = images.map(img => URLParser(img).file.name).mkString(", ")
    val header = PostHeader(postId, details("ananimas"), postTime, details("post-title"), fileNames)
    Post(text, header.format, images)
  }
}
/**
 * Creates the text representation of thread and saves it to file
 * @param thread    Thread to save
 * @param hierarchy Resource hierarchy (presumably the target directory path — confirm against `FileGenerator`)
 * @param referrer  Referrer URL
 * @param cookies   Cookies associated with the resource
 * @param loader    Loader ID
 */
case class ThreadSaver(thread: Thread, hierarchy: Seq[String] = Seq("2ch", "unsorted"), referrer: Option[String] = Some("https://2ch.hk/"), cookies: Map[String, String] = Map.empty, loader: String = "sosach-thread") extends FileGenerator {
  private def separator = StringUtils.repeated("-", 100)

  /** Header line, separator and message text of a single post. */
  private def formatPost(p: Post): String =
    s"${p.header}\n$separator\n${p.text}"

  /** Prints the thread heading followed by every post. */
  private def formatThread(writer: PrintWriter): Unit = {
    writer.println(separator)
    writer.println(s"Thread: ${thread.url}")
    writer.println(separator)
    thread.posts.foreach { post =>
      writer.println()
      writer.println(separator)
      writer.println(formatPost(post))
    }
  }

  /**
   * Writes the text representation of the thread to the stream. The stream is flushed but not closed.
   */
  override def write(os: OutputStream): Unit = {
    // Explicit UTF-8: PrintWriter(OutputStream) would use the platform default charset,
    // making the encoding of non-ASCII post text depend on the environment.
    val writer = new PrintWriter(new java.io.OutputStreamWriter(os, java.nio.charset.StandardCharsets.UTF_8))
    formatThread(writer)
    writer.flush()
  }

  /**
   * File name
   */
  override def fileName: Option[String] = thread.id.map(id => s"thread-$id.txt")

  /**
   * Resource URL
   */
  override def url: String = thread.url
}
/**
 * Directory name for the files of a thread: `page title [thread id]`, made a valid file name.
 * @param thread Parsed thread
 * @param page   Page the thread was parsed from
 * @return `None` if the thread has no ID or the trimmed page title is empty
 */
def subDirectoryFor(thread: Thread, page: HtmlPage): Option[String] =
  thread.id.flatMap { id =>
    val title = StringUtils.htmlTrim(page.getTitleText)
    if (title.nonEmpty) Some(PathUtils.validFileName(s"$title [$id]"))
    else None
  }
} | |
object SosachResources {
  /**
   * Creates the gallery resource for a thread URL, handled by the `sosach-thread` loader.
   * @param url       Thread URL
   * @param hierarchy Resource hierarchy
   * @param referrer  Referrer URL
   * @param cookies   Cookies; defaults to the configured session cookies
   */
  def thread(
    url: String,
    hierarchy: Seq[String] = Seq("2ch", "unsorted"),
    referrer: Option[String] = Some("https://2ch.hk/"),
    cookies: Map[String, String] = SosachParsers.session
  ): GalleryResource =
    GalleryResource("sosach-thread", url, referrer, cookies, hierarchy)
}
/**
 * Loader of 2ch threads: yields the text dump of a thread and the files attached to its posts.
 */
class SosachLoader extends HtmlUnitGalleryLoader {
  import SosachParsers._

  private val log = LoaderUtils.log

  /** Thread URL pattern, compiled once instead of on every `canLoadUrl` call. */
  private val threadUrlPattern = """https?://m?(2ch|2-ch)\.\w{2}/\w+/res/\d+\.html?""".r

  override def fileDownloader: Option[ActorRef] = Some(cfFileDownloader)

  override def webClient: WebClient = cfWebClientFactory()

  /**
   * Loader ID
   */
  override def id: String = "sosach-thread"

  /**
   * Is loader applicable to provided URL
   * @param url URL
   * @return Loader can load URL
   */
  override def canLoadUrl(url: String): Boolean =
    threadUrlPattern.findFirstIn(url).nonEmpty

  /**
   * Fetches resources from URL
   * @param url URL
   * @return Available resource
   */
  override def load(url: String): GalleryResources = Source.single {
    SosachResources.thread(url)
  }

  /**
   * Loads the resource page and passes it to `f`.
   *
   * A CloudFlare captcha page yields an empty source. A CloudFlare challenge page is bypassed by
   * retrieving the clearance cookies and loading the resource URL again.
   * @param resource Resource to load
   * @param f        Page handler
   * @return Resources produced by the handler
   */
  override protected def withResource[T <: LoadableResource](resource: LoadableResource)(f: PartialFunction[Page, Source[T, NotUsed]])(implicit ctx: GallerySaverContext): Source[T, NotUsed] = {
    val request = new WebRequest(new java.net.URL(resource.url), HttpMethod.GET)
    fakeIp.foreach(ip => request.setAdditionalHeader("X-Forwarded-For", ip))

    val cookies = new CookieManager
    this.compileCookies(resource).foreach(cookies.addCookie)

    val wc = this.webClient
    wc.withCookies(cookies) {
      val page: Page = wc.getPage(request)
      val result: Source[T, NotUsed] = try {
        page match {
          case p: Page if CloudFlareUtils.isCloudFlareCaptchaPage(p) =>
            log.error("CloudFlare captcha required")
            log.info("Request: {}", p.getWebResponse.getWebRequest)
            Source.empty[T]

          case p: Page if CloudFlareUtils.isCloudFlarePage(p) =>
            log.info("Bypassing CloudFlare page: {}", p)
            // Obtained again as in the original code, but no longer shadows the outer client
            val bypassClient = this.webClient
            bypassClient.addCookies(cloudFlareBypass.retrieveCookies(p.getUrl))
            bypassClient.withGetHtmlPage(resource.url)(f)

          case p: Page =>
            f(p)

          case _ =>
            Source.empty[T]
        }
      } finally {
        // Release the page even when the handler throws
        if (page ne null) page.cleanUp()
      }
      result
    }
  }

  /**
   * Fetches sub resources from URL
   * @param resource Parent resource
   * @return Available resources
   */
  override def load(resource: LoadableResource): GalleryResources = {
    withResource(resource) {
      case page: HtmlPage =>
        val thread: Thread = Thread(page)
        val cookies = extractCookies(resource)
        // Read eagerly: the source below is evaluated after the page has been cleaned up
        val pageUrl: Option[String] = Some(page.getUrl.toString)

        // The "unsorted" placeholder is replaced with a directory named after the thread
        val hierarchy = if (resource.hierarchy.lastOption.contains("unsorted")) {
          resource.hierarchy.dropRight(1) :+ subDirectoryFor(thread, page).getOrElse("unsorted")
        } else {
          resource.hierarchy
        }

        val textFile = if (thread.posts.nonEmpty) {
          Some(ThreadSaver(thread, hierarchy, pageUrl, cookies))
        } else {
          None
        }

        // Every iterator is created inside the factory, so each materialization of the source
        // gets a fresh one instead of sharing an already exhausted iterator.
        Source.fromIterator(() => textFile.iterator ++ thread.posts.iterator
          .flatMap(_.images)
          .map(FileResource(this.id, _, pageUrl, cookies, hierarchy)))
    }
  }
}

Loaders.register[SosachLoader]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment