Skip to content

Instantly share code, notes, and snippets.

@krrrr38
Created May 15, 2013 02:41
Show Gist options
  • Save krrrr38/5581308 to your computer and use it in GitHub Desktop.
Save krrrr38/5581308 to your computer and use it in GitHub Desktop.
Naverまとめの特定ページから元の画像を全て取得する
package com.krrrr38.net.naver
import scala.collection.JavaConversions._
import org.jsoup.Jsoup
import scalax.io.JavaConverters._
import scalax.file.Path
/** Command-line tool that downloads the original images linked from a
  * paginated Naver Matome page into the local `out` directory.
  *
  * Flow: read the pagination to find the last page number, visit every
  * `?page=N` listing, follow each thumbnail link to its detail page, pull the
  * original image URL out of that page, and save each image.
  */
object NaverImageExtractor {
  import scala.util.control.NonFatal

  type ImageUrl = String

  /** Entry point.
    *
    * @param args exactly one element: the URL of the matome page to scrape
    * @throws IllegalArgumentException if the argument count is not one
    */
  def main(args: Array[String]): Unit = {
    // `require` cannot be elided by compiler flags the way `assert` can.
    require(args.length == 1, "usage: NaverImageExtractor <matome-page-url>")
    val baseUrl = args(0)
    val lastPage = fetchLastPage(baseUrl)
    scrapeAll(baseUrl, lastPage).par.foreach(save)
  }

  /** Downloads `url` and parses it with jsoup, always closing the underlying
    * source afterwards.
    *
    * Lines are concatenated without separators, exactly as the previous
    * inline code did.
    * NOTE(review): decoding uses the implicit default codec of
    * `Source.fromURL`; confirm it matches the encoding of the fetched pages.
    */
  private def fetchDocument(url: String) = {
    val source = io.Source.fromURL(url)
    try Jsoup.parse(source.getLines().mkString)
    finally source.close()
  }

  /** Reads the number of the last page from the pagination links.
    *
    * @param url URL of the matome page
    * @return the text of the last `.MdPagination03 a` element parsed as an
    *         integer, or 1 when the page has no such element
    * @throws NumberFormatException if that text is not an integer
    */
  def fetchLastPage(url: String): Int = {
    // `last` is the jsoup method and yields null when nothing matched.
    val lastLink = fetchDocument(url).select(".MdPagination03 a").last
    Option(lastLink).map(link => Integer.parseInt(link.text)).getOrElse(1)
  }

  /** Collects the original image URLs from pages 1 to `lastPage` (inclusive).
    *
    * @param baseUrl  URL of the matome page; `?page=N` is appended to it
    * @param lastPage number of the last page to visit
    */
  def scrapeAll(baseUrl: String, lastPage: Int): Seq[ImageUrl] =
    (1 to lastPage).flatMap(page => scrape(baseUrl + "?page=" + page))

  /** Collects the original image URLs referenced by one listing page.
    *
    * Detail pages are fetched in parallel. A detail page without an
    * original-image link is reported on stderr and skipped instead of
    * aborting the whole run.
    *
    * @param url URL of a single listing page
    */
  def scrape(url: String): Seq[ImageUrl] = {
    val detailLinks = fetchDocument(url).select(".mdMTMWidget01ItemImg01View a")
    detailLinks.map(_.attr("href")).par.flatMap { href =>
      val found = findOriginalImageUrl(href)
      if (found.isEmpty)
        Console.err.println(s"skipping $href: no original image link found")
      found.toList
    }.seq
  }

  /** Looks up the `href` of the first `.mdEndView01Img01 a` element on a
    * detail page; `None` when the page has no such element.
    */
  private def findOriginalImageUrl(url: String): Option[ImageUrl] =
    Option(fetchDocument(url).select(".mdEndView01Img01 a").first).map(_.attr("href"))

  /** Extracts the original image URL from a detail page.
    *
    * @param url URL of the detail page
    * @return the `href` of the first `.mdEndView01Img01 a` element
    * @throws NoSuchElementException if the page has no such element
    */
  def scrapeOriginalImageUrl(url: String): ImageUrl =
    findOriginalImageUrl(url).getOrElse(
      throw new NoSuchElementException("no original image link found on " + url))

  /** File name suffixes that are kept as they are when saving. */
  val imageExtensions = List(".jpg", ".png", ".gif", ".bmp")

  /** Downloads one image into the `out` directory.
    *
    * The file name is the last `/`-separated segment of the URL; `.jpg` is
    * appended unless the name already ends with one of [[imageExtensions]]
    * (compared case-insensitively). On a non-fatal failure the error is
    * reported on stderr and any partially written file is removed, so one bad
    * image does not stop the remaining downloads.
    *
    * @param url URL of the image to download
    */
  def save(url: ImageUrl): Unit = {
    val filename = url.split("/").last
    val lowered = filename.toLowerCase(java.util.Locale.ROOT)
    val name =
      if (imageExtensions.exists(lowered.endsWith(_))) filename
      else filename + ".jpg"
    try {
      // URL construction is inside the try so a malformed URL is handled too.
      val image = new java.net.URL(url).asInput.bytes
      Path("out", name).write(image)
    } catch {
      case NonFatal(e) =>
        Console.err.println(s"failed to save $url: $e")
        new java.io.File("out/" + name).delete()
    }
  }
}
import sbt._
import sbt.Keys._
/** sbt build definition for the `naver_image_extractor` project, rooted at
  * the current directory.
  */
object NaverImageExtractorBuild extends Build {

  /** A scala-io 0.4.1 module with `javax.transaction:jta` excluded. */
  private def scalaIoModule(artifact: String) =
    ("com.github.scala-incubator.io" %% artifact % "0.4.1").exclude("javax.transaction", "jta")

  /** Libraries the scraper is compiled against. */
  private val extractorDependencies = Seq(
    "org.jsoup" % "jsoup" % "0.2.1b",
    scalaIoModule("scala-io-core"),
    scalaIoModule("scala-io-file")
  )

  /** sbt default settings plus this project's coordinates and dependencies. */
  private val extractorSettings = Project.defaultSettings ++ Seq(
    name := "naver_image_extractor",
    organization := "com.krrrr38.net.naver",
    version := "0.1-SNAPSHOT",
    scalaVersion := "2.10.1",
    libraryDependencies ++= extractorDependencies
  )

  lazy val naver_image_extractor = Project(
    id = "naver_image_extractor",
    base = file("."),
    settings = extractorSettings
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment