Created
December 19, 2012 16:17
-
-
Save pchalasani/4337940 to your computer and use it in GitHub Desktop.
Cascalog Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Create a map without quoting: (an interesting defmacro exercise!) | |
;; (qmap city nyc population 14000000) | |
;; => {"city" "nyc", "population" "14000000"} | |
;; (some special chars still need to be quoted though!) | |
(defmacro qmap
  "Builds a map of strings from unquoted key/value tokens.

  The argument forms are never evaluated; each one is converted with `str`,
  so symbols, numbers, booleans and string literals all end up as strings:

    (qmap city nyc population 14000000)
    ;=> {\"city\" \"nyc\", \"population\" \"14000000\"}

  Tokens that the Clojure reader cannot read as a single symbol or literal
  must still be written as string literals.

  Throws IllegalArgumentException during macroexpansion when given an odd
  number of forms, i.e. when a key has no value."
  [& args]
  (when (odd? (count args))
    (throw (IllegalArgumentException.
            (str "qmap expects an even number of forms, got " (count args)))))
  ;; `str` already yields Strings, so the former extra `(map name ...)` pass
  ;; was a no-op and has been dropped.
  `(apply hash-map (map str '~args)))
;; conf settings: | |
(def my-conf
  "Job-conf overrides passed to `with-job-conf`. Built with `qmap`, so every
  key and every value below ends up as a string."
  (qmap
    ;; scratch directory, job visibility and queue
    hadoop.tmp.dir                          "/tmp/prasadch"
    mapreduce.job.acl-view-job              "*"
    mapred.job.queue.name                   adhoc

    ;; map side
    mapred.map.tasks.speculative.execution  true
    mapred.min.split.size                   10737418240 ; = 10 * 1024^3

    ;; reduce side: experimental settings, added to see whether they speed
    ;; the job up. Both the mapred.* and mapreduce.* spellings are set.
    ;; (alternative kept for reference: mapred.reduce.tasks 100)
    mapred.reduce.tasks                     1000
    mapred.reduce.child.java.opts           -Xmx3000m
    mapreduce.reduce.child.java.opts        -Xmx3000m
    mapred.job.reduce.memory.mb             3072
    mapreduce.job.reduce.memory.mb          3072))
;; save property_id -> property_name map
(defn prop-map
  "Runs a query over the text file at `prop-file` under `my-conf` and returns
  what `??<-` yields for the output fields [?site-id ?site].

  Each input line is handed to `re-parse-with` with a comma pattern to obtain
  the two fields. NOTE(review): `re-parse-with` is defined outside this view;
  presumably it splits the line on commas -- confirm."
  [prop-file]
  (with-job-conf my-conf
    (??<- [?site-id ?site]
          ((hfs-textline prop-file) :> ?row)
          (re-parse-with [","] ?row :> ?site-id ?site))))
;; generate audience data fields from raw feed | |
(defn abf-gen
  "Returns a query producing [!week !bcookie !site !sess-id !time-spent] from
  the raw feed at `path`.

  Only records whose \"type\" field equals \"p\" are matched. Each record's
  site id is joined against the pairs returned by `(prop-map prop-file)` to
  obtain `!site`, and `!week` is the timestamp integer-divided by
  7*24*60*60 (= 604800).
  NOTE(review): that divisor is one week only if the timestamp is in
  seconds -- confirm against the feed."
  [prop-file path]
  (let [feed       (hfs-seqproj path
                                :outfields ["timestamp:Long" "cookie_id" "src_pty"
                                            "type" "nw_sess_id" "time_spent:Long"])
        site-names (prop-map prop-file)
        week-span  (* 7 24 60 60)]
    (<- [!week !bcookie !site !sess-id !time-spent]
        (feed !time !bcookie !site-id "p" !sess-id !time-spent)
        (:distinct false)              ; keep identical output rows
        (:trap errors)                 ; `errors` is resolved outside this view
        (site-names !site-id !site)
        (quot !time week-span :> !week))))
;; calc various site metrics | |
(defmain SiteMetricsSketch
  ;; Entry point: reads the feed at `data-path` through `abf-gen`, aggregates
  ;; per (site, week), and writes one text line per group to `out-path`.
  ;; Output columns: users, views, visits, total length, then the integer
  ;; ratios views/visit, length/visit, views/user and length/user.
  ;; NOTE(review): users and visits come from agg-hyperloglog + cardinality,
  ;; presumably approximate distinct counts -- confirm in ops/hll.
  [prop-file data-path out-path]
  (with-job-conf my-conf
    ;; (cio/with-log-level :fatal ;; suppress all log msgs except fatal
    (let [sink     (hfs-textline out-path)
          audience (abf-gen prop-file data-path)]
      (?<- sink
           [!site !week
            !users !views !visits !len
            !views-visit !len-visit
            !views-user !len-user]
           (audience !week !bcookie !site !sess-id !time-spent)
           (:trap errors)
           ;; aggregates per (site, week)
           (c/count :> !views)
           (ops/agg-hyperloglog !bcookie :> !user-sketch)
           (hll/cardinality !user-sketch :> !users)
           (ops/agg-hyperloglog !sess-id :> !visit-sketch)
           (hll/cardinality !visit-sketch :> !visits)
           (c/sum !time-spent :> !len)
           ;; derived ratios (integer division via quot)
           (quot !views !users :> !views-user)
           (quot !len !users :> !len-user)
           (quot !views !visits :> !views-visit)
           (quot !len !visits :> !len-visit)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for my-conf example