Created
September 17, 2010 12:28
-
-
Save yuroyoro/584154 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import scala.io.{Codec, Source} | |
| import scala.util.matching.Regex | |
| import java.io._ | |
| import java.net.URL | |
/**
 * A document held as a sequence of text lines that can be saved to a file
 * and converted to its parsed form.
 */
trait ScrapedHtml {

  /** The lines of the document, in order. */
  val src: Iterable[String]

  /**
   * Writes the lines of `src` to `fileName`, joined with the platform line
   * separator (`line.separator`); no trailing separator is appended.
   *
   * The file is written with `FileWriter`, i.e. in the JVM's default charset.
   *
   * If the file cannot be opened, nothing is written, a message is printed to
   * standard error and the method returns normally. A failure while writing or
   * closing is propagated to the caller; the writer is closed either way.
   *
   * @param fileName path of the file to create or overwrite
   */
  def write(fileName: String): Unit = {
    // Opening is best-effort, but only for the failures opening a file can
    // actually produce, and never silently: the caller gets a diagnostic.
    val opened: Option[BufferedWriter] =
      try Some(new BufferedWriter(new FileWriter(fileName)))
      catch {
        case e @ (_: IOException | _: SecurityException) =>
          Console.err.println("ScrapedHtml.write: cannot open " + fileName + ": " + e)
          None
      }

    opened.foreach { bw =>
      try bw.write(src.mkString(System.getProperty("line.separator")))
      finally bw.close()
    }
  }

  /** Returns the parsed form of this document. */
  def parse: ScrapedHtml
}
/**
 * Unprocessed lines of a document, markup included.
 *
 * @param src the raw lines
 */
case class RawHtml(src: Iterable[String]) extends ScrapedHtml {

  /**
   * Produces a [[ParsedHtml]] from these lines.
   *
   * In each line, every match of `<.+?>` (a `<`, the shortest run of one or
   * more characters, then `>`) and every tab character is deleted. Lines that
   * are empty after that are dropped; lines that still contain anything,
   * including only spaces, are kept.
   */
  def parse = {
    val stripped = src.map { line => line.replaceAll("""<.+?>|\t""", "") }
    val kept = stripped.filter { text => text.nonEmpty }
    ParsedHtml(kept)
  }
}
/**
 * Lines of a document that are treated as already parsed.
 *
 * @param src the lines of the document
 */
case class ParsedHtml(src: Iterable[String]) extends ScrapedHtml {

  /** Nothing further to do for an already-parsed document: returns this same instance. */
  def parse: ScrapedHtml = {
    this
  }
}
/**
 * Fetches a document from a URL and exposes it as a [[ScrapedHtml]].
 */
object HtmlScraper {

  /**
   * Matches a line containing `content … charset=<name>` and captures `<name>`.
   * Case-insensitive, so declarations such as `charset=UTF-8` or
   * `charset=Shift_JIS` are recognised as well as lower-case ones.
   */
  private val CharsetDecl = """(?i).*content.*charset\s*=\s*([0-9a-z\-_]+).*""".r

  /**
   * Downloads `url` and wraps its lines in a [[RawHtml]].
   *
   * @param url address of the document to fetch
   */
  def apply(url: String): ScrapedHtml = RawHtml(getSource(url).getLines().toSeq)

  /**
   * Downloads the whole body of `url` into memory and returns it as a `Source`
   * decoded with the charset the document declares, or with `Codec.default`
   * when no usable declaration is found.
   *
   * The network stream is always closed before this method returns.
   *
   * @param url address of the document to fetch
   * @throws java.net.MalformedURLException if `url` is not a valid URL
   * @throws java.io.IOException if the connection or the read fails
   */
  def getSource(url: String): Source = {
    val in = new URL(url).openStream()
    val bytes = try readFully(in) finally in.close()
    Source.fromBytes(bytes)(detectCodec(bytes))
  }

  /**
   * Fetches `url` and saves it to `fileName`.
   *
   * @param url      address of the document to fetch
   * @param fileName path of the file to write
   * @param toParse  when true (the default) the document's `parse` result is
   *                 saved; when false the downloaded lines are saved as they are
   */
  def download(url: String, fileName: String, toParse: Boolean = true): Unit = {
    val page = HtmlScraper(url)
    val html = if (toParse) page.parse else page
    html.write(fileName)
  }

  /** Reads `in` to end-of-stream in 8 KiB chunks and returns everything read. */
  private def readFully(in: InputStream): Array[Byte] = {
    val out = new ByteArrayOutputStream()
    val chunk = new Array[Byte](8192)
    Iterator
      .continually(in.read(chunk))
      .takeWhile(_ != -1)
      .foreach(n => out.write(chunk, 0, n))
    out.toByteArray
  }

  /**
   * Picks the codec for `bytes` from the first line matching [[CharsetDecl]].
   *
   * The bytes are scanned as ISO-8859-1, which maps every byte to a character,
   * so the scan itself cannot fail on the yet-unknown encoding. A declared name
   * that the JVM rejects (illegal or unsupported) falls back to `Codec.default`.
   */
  private def detectCodec(bytes: Array[Byte]): Codec =
    Source
      .fromBytes(bytes, "ISO-8859-1")
      .getLines()
      .collectFirst { case CharsetDecl(name) => name }
      .flatMap { name =>
        // Charset lookup signals both illegal and unsupported names with
        // subclasses of IllegalArgumentException.
        try Some(Codec(name))
        catch { case _: IllegalArgumentException => None }
      }
      .getOrElse(Codec.default)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment