Skip to content

Instantly share code, notes, and snippets.

@jsundram
Last active October 1, 2015 14:38
Show Gist options
  • Select an option

  • Save jsundram/2009477 to your computer and use it in GitHub Desktop.

Select an option

Save jsundram/2009477 to your computer and use it in GitHub Desktop.
transaction parser
(ns transparse.core
  ;; clojure.java.io is a Clojure namespace, so it has to be loaded with
  ;; :require; :import is only meant for Java classes. Code that refers to it
  ;; by its full name (clojure.java.io/reader) keeps working unchanged.
  ;; java.lang.Float needs no import: java.lang is always in scope.
  (:require [clojure.java.io :as io]
            [clojure.string :as string]
            [clj-time.format :as form])
  (:gen-class))
(defn to-fields
  "Splits one line of tab-separated input into a vector of field strings.

  Leading and trailing whitespace is trimmed from the whole line before it is
  split; whitespace around individual fields is left for the caller."
  [line]
  ;; Backslashes are not string-escaped inside a Clojure regex literal, so a
  ;; tab is written #"\t". The earlier #"\\t" matched a literal backslash
  ;; followed by the letter t, so real tab-separated lines were never split.
  (string/split (string/trim line) #"\t"))
;; Built once and reused: parse-date runs for every input line, and
;; constructing a new formatter per call is loop-invariant work.
;; NOTE(review): the underlying Joda-Time formatter is documented as immutable
;; and thread-safe, which is what makes sharing it across pmap workers safe.
(def ^:private timestamp-formatter
  (form/formatter "yyyy-MM-dd HH:mm:ss.S"))

(defn parse-date
  "Parses a timestamp string such as \"2011-03-02 16:05:52.0\" using the
  pattern yyyy-MM-dd HH:mm:ss.S and returns the result of clj-time's parse."
  [str-date]
  (form/parse timestamp-formatter str-date))
(defn process
  "Converts the raw string fields of one record into
  [sender receiver timestamp amount currency].

  The argument is a sequence of six fields in file order: sender, receiver,
  timestamp, date, amount, currency. The separate date field is dropped.
  Sender and receiver are trimmed, the timestamp goes through parse-date, and
  the amount is parsed as a double."
  [[sender receiver timestr _datestr amount currency]]
  [(string/trim sender)
   (string/trim receiver)
   (parse-date timestr)
   ;; Double rather than Float: single precision keeps only about seven
   ;; significant digits, which is too few for monetary amounts.
   (Double/parseDouble amount)
   currency])
(defn -main
  "Parses every record of the tab-separated file `file-name` in parallel and
  prints the number of records parsed.

  The first line of the file is treated as a column header and skipped."
  [file-name]
  (with-open [rdr (clojure.java.io/reader file-name)]
    (let [;; mapv is eager, so each chunk is really parsed inside its pmap
          ;; worker. A lazy map would hand back unrealized sequences and no
          ;; parsing would ever run.
          count-chunk (fn [lines]
                        (count (mapv (comp process to-fields) lines)))
          ;; Drop the header row so it is not fed to the record parser.
          records (rest (line-seq rdr))]
      ;; partition-all keeps the final chunk even when it holds fewer than
      ;; 128 lines; plain partition silently discards it.
      ;; reduce forces all the work before with-open closes the reader, and
      ;; sums per-chunk sizes so the output is a record count, not a chunk
      ;; count.
      (println (reduce + 0 (pmap count-chunk (partition-all 128 records))))))
  ;; pmap runs on the agent thread pool; shut it down so the JVM can exit.
  (shutdown-agents))
import datetime
import gzip
import time
def main(path=None):
    """Parse a gzipped, tab-separated transaction file into a list of records.

    The original author noted that this takes about 4 minutes for 1 day of
    data.

    Args:
        path: Path of the gzip file to read. When omitted, the module-level
            ``datafile`` name is used, which keeps the zero-argument call
            working as before.

    Returns:
        A list with one ``[sender, receiver, timestamp, amount, currency]``
        entry per data line, where ``timestamp`` is a ``datetime.datetime``
        and ``amount`` is a ``float``. The header line and the separate date
        column are not included.

    Raises:
        ValueError: If a line does not have exactly six tab-separated fields,
            or its timestamp or amount cannot be parsed.
    """
    # header:
    # sndr_last_login_ip rcvr_last_login_ip pmt_cre_ts pmt_cre_dt pmt_usd_amt pmt_curr_code
    # 202.189.72.37 124.183.171.170 2011-03-02 16:05:52.0 2011-03-02 6.0900 AUD
    if path is None:
        path = datafile
    data = []
    start = time.time()
    # Text mode yields str lines; the context manager closes the file even
    # when a malformed line raises part-way through.
    with gzip.open(path, "rt") as f:
        f.readline()  # Skip the column header.
        for i, line in enumerate(f):
            if i % 100000 == 0:
                print(i)  # Progress marker.
            sender, receiver, timestr, _datestr, amount, currency = (
                line.strip().split("\t")
            )
            # strptime lives on the datetime class, not on the module.
            t = datetime.datetime.strptime(timestr, "%Y-%m-%d %H:%M:%S.%f")
            data.append(
                [sender.strip(), receiver.strip(), t, float(amount), currency]
            )
    # len(data) is the true record count; the loop index is one short and is
    # unbound when the file has no data lines.
    print("parsed %d records in %s seconds" % (len(data), time.time() - start))
    return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment