Created
May 23, 2015 09:45
-
-
Save Hajto/8b48ce689eec21c176d0 to your computer and use it in GitHub Desktop.
Scala Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import model.FuneralSchedule | |
import play.api.libs.json.Json | |
import scala.io.Source | |
// Day whose schedule is requested; it is appended verbatim to the query string below.
val date = "2015-05-05"

// Whole page body as one string. The underlying Source is closed explicitly,
// even if reading fails, so the HTTP connection is not left open.
val source = {
  val page = Source.fromURL("http://zck.krakow.pl/?pageId=16&date=" + date)
  try page.mkString finally page.close()
}

// Matches one <table>...</table> (DOTALL, so it may span lines) and captures in
// group 1 the text that starts at "Cmentarz" and runs up to the next '<'.
val regex = "(?s)<table>.+?(Cmentarz.+?)<.+?</table>".r

// Accumulators filled by the scraping loop below. Entries are prepended,
// so both lists end up in reverse document order.
var thing: List[FuneralSchedule] = List()
var jsonFeed: List[Funeral] = List()

// NOTE(review): not referenced anywhere in the visible script; kept in case
// another part of the worksheet still relies on it.
val regMatcher = "("
/** One scraped table row, held as three optional text fields.
  *
  * @param hour first cell of the row, if present
  * @param who  second cell of the row, if present
  * @param age  third cell of the row, if present
  */
case class Funeral(hour: Option[String], who: Option[String], age: Option[String]) {

  /** Always renders as the same fixed placeholder text, whatever the field values are. */
  override def toString: String = "Cos"
}
// JSON reader/writer for Funeral, derived by the Play JSON macro. The type is
// spelled out (fully qualified, so no extra import is needed) because implicit
// definitions without an explicit type are fragile during implicit resolution.
implicit val format: play.api.libs.json.Format[Funeral] = Json.format[Funeral]
// Walks every schedule table found in the downloaded page. For each table it
// records a FuneralSchedule (captured heading text + cleaned table markup) and
// one Funeral per row that contains at least one <td> cell.
val out = {
  // Compiled once up front instead of once per table / per row, which is the
  // performance problem the original TODO complained about.
  val rowPattern = """<tr\s?>.+?</\s?tr>""".r
  val cellPattern = """<td\s?>.+?</\s?td>""".r

  regex.findAllIn(source).matchData foreach { table =>
    // Strip attributes and newlines once and reuse the result for both consumers.
    val cleaned = clearStrings(table.group(0))
    thing ::= FuneralSchedule(table.group(1), cleaned)

    rowPattern.findAllIn(cleaned) foreach { row =>
      val cells = cellPattern.findAllIn(row).toVector

      // Missing cells become None instead of blowing up with a MatchError, and
      // the tag-stripped text is wrapped to fit Funeral's Option[String] fields.
      def cell(i: Int): Option[String] =
        cells.lift(i).map(raw => removeMarkers(Some(raw)))

      // Rows without any <td> produce no entry.
      if (cells.nonEmpty) {
        jsonFeed ::= Funeral(cell(0), cell(1), cell(2))
      }
    }
    println("Koniec tabeli")
  }
}

// Worksheet output: the collected schedules and the JSON rendering of the rows.
thing
Json.toJson(jsonFeed)

// Manual smoke check of the <td> tag stripper.
println(removeMarkers(Some("<td > <td> Marian Debil </ td>")))
/** Strips opening and closing `<td>` tags from an optional cell string.
  *
  * @param s raw cell markup, or None when there is no cell
  * @return the text with every `<td>` / `</td>` tag removed (one optional
  *         whitespace character is tolerated after `<`/`</` and before `>`),
  *         or a single space when `s` is None
  */
def removeMarkers(s: Option[String]): String = {
  // The helper is declared before the result expression: a block that ends in
  // a definition evaluates to Unit, which does not satisfy the String return type.
  def removeMarker(cell: String): String =
    cell.replaceAll("""(</?\s?td\s?>)""", "")

  s.fold(" ")(removeMarker)
}
/** Removes `class`, `id` and `style` attributes and all newline characters
  * from a fragment of HTML.
  *
  * Attribute values are matched as "anything up to the closing quote", so an
  * empty value such as `class=""` is removed on its own instead of swallowing
  * the text up to the next quote. Whitespace around a removed attribute is
  * left in place.
  *
  * @param s markup to clean
  * @return `s` without those attributes and without `\n` characters
  */
def clearStrings(s: String): String = {
  val attributesAndNewlines = """(?:class|id|style)="[^"]*"|\n"""
  s.replaceAll(attributesAndNewlines, "")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment