Skip to content

Instantly share code, notes, and snippets.

@yuroyoro
Created September 17, 2010 12:28
Show Gist options
  • Save yuroyoro/584154 to your computer and use it in GitHub Desktop.
import scala.io.{Codec, Source}
import scala.util.matching.Regex
import java.io._
import java.net.URL
/**
 * A page held as a sequence of text lines that can be saved to a file.
 *
 * Implementations supply the lines through `src` and decide what `parse`
 * returns for them.
 */
trait ScrapedHtml {

  /** The lines of the page, in order. */
  val src: Iterable[String]

  /**
   * Writes every line of `src` to `fileName`, joined with the platform line
   * separator (no separator is appended after the last line). An existing
   * file is overwritten. The text is encoded with the platform default
   * charset, because that is what `FileWriter` uses.
   *
   * @param fileName path of the file to create or overwrite
   * @throws java.io.IOException if the file cannot be opened or written;
   *                             failures are reported to the caller rather
   *                             than silently ignored
   */
  def write(fileName: String): Unit = {
    // Build the text before touching the file so that a failure while
    // evaluating `src` does not leave a truncated file behind.
    val content = src.mkString(System.getProperty("line.separator"))
    val writer = new BufferedWriter(new FileWriter(fileName))
    try writer.write(content)
    finally writer.close() // always release the file handle, even on failure
  }

  /** Returns the parsed form of this page. */
  def parse: ScrapedHtml
}
/**
 * A page whose lines have not been processed yet.
 *
 * @param src the lines of the page
 */
case class RawHtml(src: Iterable[String]) extends ScrapedHtml {

  /**
   * Strips markup line by line and discards the lines that end up empty.
   *
   * On each line every tab character and every shortest `<...>` run with at
   * least one character between the brackets is deleted. Matching never
   * spans more than one line, and lines containing only other whitespace
   * are kept.
   */
  def parse: ParsedHtml = {
    val markupOrTab = """<.+?>|\t""".r
    val stripped = src.map { line => markupOrTab.replaceAllIn(line, "") }
    ParsedHtml(stripped.filter { text => !text.isEmpty })
  }
}
/**
 * A page that counts as already parsed.
 *
 * @param src the lines of the page
 */
case class ParsedHtml(src: Iterable[String]) extends ScrapedHtml {

  /** Nothing is left to do: parsing again yields this same instance. */
  def parse: ScrapedHtml = this
}
/**
 * Downloads pages and exposes them as [[ScrapedHtml]] values.
 */
object HtmlScraper {

  /**
   * Matches a whole line that declares a character set, capturing its name.
   * Accepts `content="...; charset=NAME"` as well as `<meta charset="NAME">`,
   * in any letter case and with or without a quote before the name.
   */
  private val CharsetDecl =
    """(?i).*(?:content.*|<meta\s[^>]*?)charset\s*=\s*["']?([0-9a-z._:+\-]+).*""".r

  /**
   * Fetches `url` and wraps its lines in a [[RawHtml]].
   *
   * @param url address of the page to fetch
   */
  def apply(url: String): ScrapedHtml = RawHtml(getSource(url).getLines().toList)

  /**
   * Downloads `url` completely and returns its content as a `Source`,
   * decoded with the charset the page declares about itself. When no usable
   * declaration is found, `Codec.default` is used.
   *
   * The network stream is closed before this method returns; the returned
   * `Source` reads from memory only.
   *
   * @param url address of the page to fetch
   */
  def getSource(url: String): Source = {
    val in = new URL(url).openStream()
    val bytes = try readAll(in) finally in.close()
    Source.fromBytes(bytes)(detectCodec(bytes))
  }

  /**
   * Fetches `url` and saves it to `fileName`.
   *
   * @param url      address of the page to fetch
   * @param fileName path of the file to write
   * @param toParse  when true (the default) the page is passed through
   *                 `parse` before it is written
   */
  def download(url: String, fileName: String, toParse: Boolean = true): Unit = {
    val fetched = HtmlScraper(url)
    val html = if (toParse) fetched.parse else fetched
    html.write(fileName)
  }

  /** Reads `in` to exhaustion in fixed-size chunks; the caller closes it. */
  private def readAll(in: InputStream): Array[Byte] = {
    val out = new ByteArrayOutputStream
    val chunk = new Array[Byte](8192)
    Iterator.continually(in.read(chunk)).takeWhile(_ != -1).foreach { n =>
      out.write(chunk, 0, n)
    }
    out.toByteArray
  }

  /**
   * Finds the first line carrying a charset declaration this JVM supports
   * and returns the matching codec, or `Codec.default` when there is none.
   */
  private def detectCodec(bytes: Array[Byte]): Codec = {
    // ISO-8859-1 maps every byte to one character, so this first pass cannot
    // fail on malformed input whatever the real encoding turns out to be.
    val declared = Source.fromBytes(bytes, "ISO-8859-1").getLines().collect {
      case CharsetDecl(name) if isSupportedCharset(name) => Codec(name)
    }
    if (declared.hasNext) declared.next() else Codec.default
  }

  /** True when `name` is a legal charset name that this JVM can decode. */
  private def isSupportedCharset(name: String): Boolean =
    try java.nio.charset.Charset.isSupported(name)
    catch { case _: IllegalArgumentException => false } // illegal name syntax
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment