Skip to content

Instantly share code, notes, and snippets.

@yuroyoro
Created September 14, 2010 12:35
Show Gist options
  • Save yuroyoro/578968 to your computer and use it in GitHub Desktop.
Save yuroyoro/578968 to your computer and use it in GitHub Desktop.
import scala.io.{Codec, Source}
import scala.util.matching.Regex
import java.io._
import java.net.URL
/** Small utilities for downloading a web page, optionally stripping its
  * markup, and saving the result to a local file.
  */
object HtmlScraper {

  /** Finds a `charset=<name>` declaration, e.g. in
    * `<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">`
    * or `<meta charset="utf-8">`. The match is case-insensitive and tolerates
    * an opening quote before the name; group 1 is the charset name.
    */
  private val CharsetDecl =
    """(?i)charset\s*=\s*["']?([0-9a-z][0-9a-z_.:+-]*)""".r

  /** Downloads `url` and returns its content as a [[scala.io.Source]].
    *
    * The whole response is read into memory first. The bytes are then scanned
    * for a `charset=` declaration; if one names a charset supported by this
    * JVM it is used for decoding, otherwise `Codec.default` is used.
    *
    * @param url the address to fetch (anything `java.net.URL` can open)
    * @return a `Source` over the decoded content
    */
  def getSource(url: String): Source = {
    val in = new BufferedInputStream(new URL(url).openStream())
    val bytes =
      try Iterator.continually(in.read()).takeWhile(_ != -1).map(_.toByte).toArray
      finally in.close() // always release the connection, even if reading fails
    Source.fromBytes(bytes)(detectCodec(bytes))
  }

  /** Picks the codec declared inside the document, or `Codec.default`. */
  private def detectCodec(bytes: Array[Byte]): Codec = {
    // ISO-8859-1 maps every byte to exactly one char, so the ASCII
    // `charset=` declaration can be located before the real encoding is known.
    val sniffed = new String(bytes, "ISO-8859-1")
    CharsetDecl.findFirstMatchIn(sniffed)
      .map(_.group(1))
      // An unknown charset name must not abort the download; fall back instead.
      .filter(name => java.nio.charset.Charset.isSupported(name))
      .map(name => Codec(name))
      .getOrElse(Codec.default)
  }

  /** Writes the given lines to `fileName`, joined by the platform line
    * separator (no trailing separator). The file is written with the
    * platform default encoding, as `java.io.FileWriter` does.
    *
    * @param src      the lines to write; the iterator is consumed
    * @param fileName path of the file to create or overwrite
    */
  def write(src: Iterator[String], fileName: String): Unit = {
    val out = new BufferedWriter(new FileWriter(fileName))
    try out.write(src.mkString(System.getProperty("line.separator")))
    finally out.close()
  }

  /** Downloads `url` and saves it to `fileName`.
    *
    * @param url      the address to fetch
    * @param fileName path of the file to create or overwrite
    * @param toParse  when `true` (the default) markup is stripped via
    *                 [[parse]] before writing; when `false` the lines are
    *                 written unchanged
    */
  def download(url: String, fileName: String, toParse: Boolean = true): Unit = {
    val lines = getSource(url).getLines()
    if (toParse) parseAndWrite(lines, fileName) else write(lines, fileName)
  }

  /** Strips markup from `src` with [[parse]] and writes the result to `fileName`. */
  def parseAndWrite(src: Iterator[String], fileName: String): Unit =
    write(parse(src), fileName)

  /** Removes tags and tab characters from every line and drops lines that
    * end up empty.
    *
    * A "tag" is anything matching `<.+?>` within a single line; tags that
    * span several lines are not recognised.
    *
    * @param src the raw lines
    * @return a lazy iterator over the cleaned, non-empty lines
    */
  def parse(src: Iterator[String]): Iterator[String] = {
    // Compiled once per call instead of once per line.
    val tagOrTab = new Regex("""<.+?>|\t""")
    src.map(line => tagOrTab.replaceAllIn(line, "")).filter(_.nonEmpty)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment