Created
August 3, 2013 10:12
-
-
Save piotrbelina/6145972 to your computer and use it in GitHub Desktop.
Scalding Apache log parser for boomerang.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cascading.tuple.{Fields, TupleEntry} | |
import com.twitter.scalding._ | |
import java.net.URLDecoder | |
import scala.util.matching.Regex | |
/**
 * Scalding job that reads web-server access-log lines, keeps the requests
 * whose path starts with `/beacon.php`, and aggregates the timing values
 * carried in each beacon's query string per reported page URL.
 *
 * Job arguments (read via `args`):
 *  - `input`:  path of the text access log to read
 *  - `output`: path the aggregated result is written to
 *  - `trap`:   TSV path that receives tuples whose processing threw
 *
 * Lines that cannot be parsed are not dropped silently: the parsing steps
 * throw on purpose so that the offending tuple is diverted to the trap.
 */
class BoomerangLogJob(args: Args) extends Job(args) {
  val input = TextLine(args("input"))
  val output = TextLine(args("output"))
  val trap = Tsv(args("trap"))

  val inputFields = 'line
  val regexFields = ('ip, 'time, 'method, 'event)

  // Compiled once per job instance instead of once per input record.
  // Transient + lazy so the compiled pattern is rebuilt after the job is
  // shipped to a task rather than being serialized with the closure.
  @transient private lazy val logLinePattern: Regex =
    new Regex("([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*).*$")

  input
    .read
    .addTrap(trap)
    .mapTo('line -> regexFields) {
      te: TupleEntry =>
        val line = te.getString("line")
        // Throwing here is intentional: the trap collects unparseable lines.
        val groups = logLinePattern
          .findFirstMatchIn(line)
          .getOrElse(throw new IllegalArgumentException("unparseable access-log line: " + line))
          .subgroups
        (groups(0), groups(1), groups(2), groups(3))
    }
    .filter('event) { event: String => event.matches("^/beacon\\.php.*") }
    .map('event -> ('url, 'done, 'resp)) { event: String => urlParse(event) }
    // Per page URL: number of beacons plus the mean of 'done and of 'resp.
    .groupBy('url) { _.size.average('done).average('resp) }
    // Regroup by beacon count; inside each group order by 'url (reversed) and cap the rows.
    .groupBy('size) { _.sortBy('url).reverse.take(1000000) }
    .write(output)

  /**
   * Extracts the `u`, `t_done` and `t_resp` query parameters from a beacon
   * request URL.
   *
   * Keys and values are URL-decoded as UTF-8. Parameters with an empty value
   * or without `=` (for example `r2=`) are accepted and map to the empty
   * string, so they no longer make the whole beacon fail. When a key occurs
   * more than once, the last occurrence wins.
   *
   * @param url request path including the query string,
   *            e.g. `/beacon.php?u=...&t_done=...&t_resp=...`
   * @return the decoded `(u, t_done, t_resp)` values
   * @throws IllegalArgumentException if `url` has no `?`, or if a key or
   *                                  value contains a malformed percent-escape
   * @throws NoSuchElementException   if one of the three required parameters
   *                                  is missing or empty
   */
  def urlParse(url: String): (String, String, String) = {
    // Limit 2: only the first '?' separates the path from the query string.
    val queryString = url.split("\\?", 2) match {
      case Array(_, rest) => rest
      case _ => throw new IllegalArgumentException("beacon URL has no query string: " + url)
    }

    val query: Map[String, String] = queryString
      .split("&")
      .filter(_.nonEmpty)
      .map { param =>
        // Limit 2 keeps a trailing empty value ("k=") and any '=' inside the value.
        param.split("=", 2).map(URLDecoder.decode(_, "UTF-8")) match {
          case Array(key, value) => key -> value
          case Array(key) => key -> ""
        }
      }
      .toMap

    // Required values must be present and non-empty; failing here sends the row to the trap.
    def required(key: String): String =
      query.get(key).filter(_.nonEmpty).getOrElse(
        throw new NoSuchElementException("beacon is missing required parameter: " + key))

    (required("u"), required("t_done"), required("t_resp"))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment