Skip to content

Instantly share code, notes, and snippets.

@yuroyoro
Created May 13, 2010 07:51
Show Gist options
  • Save yuroyoro/399607 to your computer and use it in GitHub Desktop.
Save yuroyoro/399607 to your computer and use it in GitHub Desktop.
/**
 * Downloads the thirteen `matomeNN.html` pages from maouyusya2828.web.fc2.com,
 * strips their HTML markup and writes each one to `./yumaoNN.rst` as UTF-8 text.
 *
 * The object exposes an explicit `main` instead of extending the deprecated
 * `Application` trait, so the crawl no longer runs inside the object's static
 * initializer and the regex members can be accessed without side effects.
 */
object YumaoCrawler {
  import scala.io.Source
  import java.io.{ FileOutputStream, OutputStreamWriter }

  /** Matches a single HTML tag, including quoted attribute values. */
  val tagr = """<("[^"]*"|'[^']*'|[^'">])*>""".r
  /** Matches a line made of a number, a colon and the first fixed poster name. */
  val sepr1 = """\d+\s*:以下、名無しにかわりましてVIPがお送りします.*""".r
  /** Matches a line made of a number, a colon and the second fixed poster name. */
  val sepr2 = """\d+\s*:以下、VIPにかわりましてパー速民がお送りします.*""".r
  /** Matches a line that consists solely of a `<title>` element; captures its text. */
  val title = """<title>(.*)</title>""".r
  /** Matches a line that consists solely of the `header2` `<h1>` element; captures its text. */
  val h1 = """<h1 id="header2">(.*)</h1>""".r
  /** Marks the first line that is no longer part of the content. */
  val end = """INDEX.*""".r
  /** Text emitted in place of a poster-name line: a 40-dash rule followed by `::`. */
  val sep = "\n%s\n\n::\n" format( "-" * 40 )

  /**
   * Program entry point: processes pages 1 to 13 in order and prints a
   * progress line after each page has been written.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit =
    (1 to 13).foreach { n =>
      save(n, toRst(fetch(n)))
      println("%02d end." format n)
    }

  /**
   * Reads page `n` completely into memory.
   *
   * The page is decoded with the default codec, exactly as before.
   * NOTE(review): confirm the site's real encoding matches the platform default.
   */
  private def fetch(n: Int): String = {
    val page = Source.fromURL(new java.net.URL(
      "http://maouyusya2828.web.fc2.com/matome%02d.html" format n))
    // Close the underlying connection even if reading fails.
    try page.mkString
    finally page.close()
  }

  /** Converts the raw HTML of one page into the text that is written to disk. */
  private def toRst(html: String): String = {
    // `linesIterator` rather than `lines`: on JDK 11+ `lines` resolves to
    // java.lang.String.lines, which returns a Java Stream, not an Iterator.
    val body = html.linesIterator
      .map(_.trim)
      .map {
        // Underline is twice the character count of the title;
        // presumably because the title uses double-width characters (TODO confirm).
        case title(s) => s + "\n" + ("=" * s.length * 2)
        case h1(s)    => s
        case line =>
          tagr.replaceAllIn(line, "").trim match {
            case sepr1() | sepr2() => sep
            // Non-empty text is indented by one space; empty lines stay empty.
            case plain => if (plain.isEmpty) plain else " " + plain
          }
      }
      .dropWhile(_.trim.isEmpty)                   // skip leading blank lines
      .takeWhile(s => end.findFirstIn(s).isEmpty)  // stop at the first line containing INDEX
      .mkString("\n")
    // Collapse runs of three or more newlines into a single blank line.
    body.replaceAll("\n{3,}", "\n\n")
  }

  /** Writes `content` to `./yumaoNN.rst` as UTF-8, always closing the file. */
  private def save(n: Int, content: String): Unit = {
    val out = new OutputStreamWriter(
      new FileOutputStream("./yumao%02d.rst" format n), "UTF-8")
    try {
      out.write(content)
      out.flush()
    } finally out.close()
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment