Created
September 14, 2010 12:35
-
-
Save yuroyoro/578968 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import scala.io.{Codec, Source} | |
| import scala.util.matching.Regex | |
| import java.io._ | |
| import java.net.URL | |
/** Small HTML scraping helper: fetches a page, detects its declared charset,
  * strips markup line by line and writes the result to a file.
  */
object HtmlScraper {

  /** Matches a whole line that contains a `charset=<name>` declaration and
    * captures the name. Case-insensitive, and tolerant of an opening quote,
    * so that `charset=UTF-8` and `<meta charset="utf-8">` are both recognised.
    */
  private val CharsetDecl = """(?i).*charset\s*=\s*["']?([0-9a-z_\-]+).*""".r

  /** Matches a non-empty tag `<...>` (shortest match) or a tab character. */
  private val TagOrTab = new Regex("""<.+?>|\t""")

  /** Downloads `url` completely and returns it as a [[scala.io.Source]].
    *
    * The bytes are decoded with the charset declared inside the document
    * (first line carrying a `charset=...` declaration); when none is declared,
    * or the declared name is not usable on this JVM, `Codec.default` is used.
    *
    * @param url address understood by `java.net.URL`
    * @return the decoded document
    */
  def getSource(url: String): Source = {
    val bytes = readFully(new URL(url).openStream())
    Source.fromBytes(bytes)(detectCodec(bytes))
  }

  /** Reads `in` to exhaustion and always closes it, even when reading fails. */
  private def readFully(in: InputStream): Array[Byte] =
    try {
      val collected = new ByteArrayOutputStream()
      val chunk = new Array[Byte](8192)
      Iterator
        .continually(in.read(chunk))
        .takeWhile(_ != -1)
        .foreach(count => collected.write(chunk, 0, count))
      collected.toByteArray
    } finally in.close()

  /** Picks the codec declared in the raw document, or `Codec.default`. */
  private def detectCodec(bytes: Array[Byte]): Codec = {
    // ISO-8859-1 maps every byte to a character, so the sniffing pass cannot
    // fail on malformed input regardless of the real encoding.
    val declared = Source.fromBytes(bytes, "ISO-8859-1").getLines().collect {
      case CharsetDecl(name) => name
    }
    if (!declared.hasNext) Codec.default
    else
      try Codec(declared.next())
      catch {
        // Unsupported and illegal charset names both surface as
        // IllegalArgumentException subclasses; treat them like "not declared".
        case _: IllegalArgumentException => Codec.default
      }
  }

  /** Writes the lines of `src` to `fileName`, separated by the platform line
    * separator (no trailing separator). The file is written through a
    * `FileWriter`, i.e. with that class's default character encoding.
    *
    * @param src      lines to write; consumed by this call
    * @param fileName path of the file to create or overwrite
    */
  def write(src: Iterator[String], fileName: String): Unit = {
    val separator = System.getProperty("line.separator")
    val writer = new BufferedWriter(new FileWriter(fileName))
    try {
      // Stream the lines instead of building one big string first.
      if (src.hasNext) writer.write(src.next())
      src.foreach { line =>
        writer.write(separator)
        writer.write(line)
      }
    } finally writer.close()
  }

  /** Fetches `url` and stores it in `fileName`.
    *
    * @param url      page to download
    * @param fileName destination file
    * @param toParse  when true (the default) tags and tabs are stripped via
    *                 [[parse]]; when false the lines are written unchanged
    */
  def download(url: String, fileName: String, toParse: Boolean = true): Unit = {
    val lines = getSource(url).getLines()
    if (toParse) parseAndWrite(lines, fileName) else write(lines, fileName)
  }

  /** Runs [[parse]] over `src` and writes the result to `fileName`. */
  def parseAndWrite(src: Iterator[String], fileName: String): Unit =
    write(parse(src), fileName)

  /** Removes tags and tab characters from every line and drops the lines
    * that end up empty.
    *
    * @param src input lines
    * @return lazily evaluated iterator over the cleaned, non-empty lines
    */
  def parse(src: Iterator[String]): Iterator[String] =
    src.map(line => TagOrTab.replaceAllIn(line, "")).filter(_.nonEmpty)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment