joinr · February 18, 2016 02:49 · thebusby · Feb 18, 2016
diff --git a/iotaproblem.clj b/iotaproblem.clj
 (ns sometest 
  (:require [iota :as iota]
 	    [clojure.core.reducers :as r]))

 ;;==Synthetic data==
 ;; Schema for the synthetic records: maps each field name to the kind of
 ;; value to generate for it (:text, :boolean, :int or :float).
 ;; Note that some names differ only by case (:category/:Category,
 ;; :Location/:location); they are distinct keys.
 (def fillrecord
   {:Unit                   :text
    :category               :text
    :DemandGroup            :text
    :SRC                    :text
    :FillType               :text
    :FollowOn               :boolean
    :name                   :text
    :Component              :text
    :operation              :text
    :start                  :int
    :DeploymentID           :int
    :duration               :int
    :dwell-plot?            :boolean
    :DwellYearsBeforeDeploy :float
    :DeployDate             :text
    :FollowOnCount          :int
    :AtomicPolicy           :text
    :Category               :text
    :DeployInterval         :int
    :fill-type              :text
    :FillPath               :text
    :Period                 :text
    :unitid                 :int
    :deltat                 :int
    :Demand                 :text
    :PathLength             :int
    :OITitle                :text
    :BogBudget              :int
    :CycleTime              :int
    :DeploymentCount        :int
    :DemandType             :text
    :quantity               :int
    :end                    :int
    :FillCount              :int
    :Location               :text
    :location               :text
    :compo                  :text
    :DwellBeforeDeploy      :int
    :Policy                 :text
    :sampled                :boolean})

 (defn fld->val
   "Produce a random value of type tag `t` for field `fld`.
   `:text` yields a fresh string built from `fld` via gensym, `:boolean`
   a random true/false, `:int` a random integer in [0, 50000) and
   `:float` a random double in [0, 50000).  Any other tag makes `case`
   throw, since there is no default clause."
   [fld t]
   (case t
     :int     (rand-int 50000)
     :float   (* (rand) 50000)
     :boolean (rand-nth [true false])
     :text    (str (gensym fld))))
 ;;generate random records from the schema.
 (defn fake-record
   "Build one random record: walks the `fillrecord` schema and replaces
   each field's type tag with a value generated by `fld->val`.  The schema
   map itself is the starting accumulator, so every key is overwritten."
   []
   (let [fill-field (fn [rec fld type-tag]
                      (assoc rec fld (fld->val fld type-tag)))]
     (reduce-kv fill-field fillrecord fillrecord)))
 ;;Using the preceding functions, we dump 4.395605x10^6 records to
 ;;the file at the path below.
 (def testfile "c:/path/to/bigfile.txt")
 ;;Dumping elided for brevity...the end result is a ~2GB tab-delimited
 ;;text file with headers in the first line.
 ;;NOTE: the `...` body is a placeholder for the elided implementation; it
 ;;has to be filled in before this form can be loaded.
 (defn records->file [path] ...)


 ;;Testfile has these characteristics:
 ;;4.3x10^6 newline-separated lines of
 ;;490 utf-8 Chars /line
 ;;40 tab-delimited fields (although it doesn't matter for this).
 ;;2GB in size.

 ;;==Simple Test - Count the lines of the file==

 ;;Simply increment an int, occasionally printing out
 ;;every 100000th line for status.
 (defn count-lines
   "Reduce over `r`, counting its elements.  Before each increment the
   running count is printed when it is a multiple of 100000 (so 0 is
   printed at the first element).  Returns the final count."
   [r]
   (let [tally (fn [seen _line]
                 (when (zero? (rem seen 100000))
                   (println seen))
                 (unchecked-inc seen))]
     (reduce tally 0 r)))
 ;;Time how long it takes us to traverse and count the file.
 ;;overhead from printing is infinitesimal, so this should
 ;;simply walk the file(seq).
 (defn test
   "Time a full traversal of the file at `path`, counting lines with
   `count-lines`.  `time` prints the elapsed time; the line count is
   returned.

   Options:
     :n - when supplied, passed as the second argument to `iota/seq`
          (used by the caller as a buffer size); when omitted, the
          one-argument `iota/seq` is used.

   The branch is chosen on `n` itself.  The previous version used
   `(iota/seq path n)` as the `if` condition, which built a throwaway seq
   and then always counted `(iota/seq path)`, so `n` never took effect.

   NOTE(review): this name shadows clojure.core/test within the namespace."
   [path & {:keys [n]}]
   (time (count-lines (if n
                        (iota/seq path n)
                        (iota/seq path)))))

 ;;==specs==
 ;;Machine (laptop) is underspecced:
 ;;Intel Core2 T7200 @ 2.00 ghz
 ;;2GB Ram (this hurts :( )
 ;;Windows 7 64bit, on 64 bit JVM
 ;;NOT running -server in jvm opts (clients can't have jdk currently).

 ;;==results==
 ;;Run 1: no :n option.
 ;;This chokes entirely...it doesn't even print the initial 0 within minutes.
 (test testfile)
 ;;Run 2: I saw the default buffer size in FileSeq is pretty big,
 ;;so try again asking for a 4KB buffer via :n.
 (test testfile :n (* 4 1024))
 ;;Cooks along nicely, right up until about 3x10^6, then
 ;;starts chugging.  Looks like GC goes through the roof, although
 ;;we never grow the heap....(never go above 490mb or so, despite
 ;;using -Xmx1g)

 ;;run completes in 291443 ms

 ;;visualvm shows most of the time is spent grinding in MMap/get
 ;;(understandable from glancing at the source).

 ;;GC behavior is really weird to me though, can't help but think it'd
 ;;be fine if iota would take advantage of the available heap.  Maybe
 ;;this has something to do with filechannel? We get a jagged garbage
 ;;collection trail, hovering under 500MB without ever growing the heap.


 ;;==Alternatives==
 ;;--FastUtil
 ;;In contrast, using it.unimi.dsi.fastutil.io.FastBufferedReader
 ;;from FastUtil library, wrapped in a clojure compatible reducer,
 ;;I churn through the same file pretty easily.
 ;;It's a little slower out the gates (in terms of visible printing/traversal)
 ;;BUT it's consistent, and completes in 63145 ms.
 ;;Notable differences: FastBufferedReader is unsynchronized (dunno if this
 ;;affects the aforementioned observations).  I'd have to line-by-line the
 ;;source to see any other differences.  FastBufferedReader is not using
 ;;file channels by default.

 ;;We get a nice saw-tooth GC collection profile, peaking at max usage of
 ;;~583mb.  Heap is grown to 715mb.

 ;;--MMap
 ;;Also, I implemented an mmap version based off Eric Rochester's example at:
 ;;http://www.ericrochester.com/pages/code/parallel-io-with-mmap/
 ;;This version is a bit slower than the FastBufferedReader, but it works.
 ;;It uses nio/mmap and a (* 10 1024 1024) buffer.  Job completes in 84472ms.
 ;;Expands the heap during processing beyond 500mb to 779mb.
 ;;We get a higher-frequency gc wave, with peak usage at 668mb.

 ;;Both the mmap and fastutil implementations are actually returning strings
 ;;as part of their .readLine implementation, so some of the gc is probably
 ;;due to string creation.

 ;;==Summary==
 ;;Both alternative implementations finish the job much faster than iota/seq.
 ;;Both actually take advantage of the available heap (although they don't
 ;;come near exhausting it).  They are also coercing lines to strings in the
 ;;process.

 ;;Why is iota/seq sucking wind on this?
	(ns sometest
	(:require [iota :as iota]
	[clojure.core.reducers :as r]))

	;;==Synthetic data==
	;; Schema for the synthetic records: maps each field name to the kind of
	;; value to generate for it (:text, :boolean, :int or :float).
	;; Note that some names differ only by case (:category/:Category,
	;; :Location/:location); they are distinct keys.
	(def fillrecord
	  {:Unit                   :text
	   :category               :text
	   :DemandGroup            :text
	   :SRC                    :text
	   :FillType               :text
	   :FollowOn               :boolean
	   :name                   :text
	   :Component              :text
	   :operation              :text
	   :start                  :int
	   :DeploymentID           :int
	   :duration               :int
	   :dwell-plot?            :boolean
	   :DwellYearsBeforeDeploy :float
	   :DeployDate             :text
	   :FollowOnCount          :int
	   :AtomicPolicy           :text
	   :Category               :text
	   :DeployInterval         :int
	   :fill-type              :text
	   :FillPath               :text
	   :Period                 :text
	   :unitid                 :int
	   :deltat                 :int
	   :Demand                 :text
	   :PathLength             :int
	   :OITitle                :text
	   :BogBudget              :int
	   :CycleTime              :int
	   :DeploymentCount        :int
	   :DemandType             :text
	   :quantity               :int
	   :end                    :int
	   :FillCount              :int
	   :Location               :text
	   :location               :text
	   :compo                  :text
	   :DwellBeforeDeploy      :int
	   :Policy                 :text
	   :sampled                :boolean})

	(defn fld->val
	  "Produce a random value of type tag `t` for field `fld`.
	  `:text` yields a fresh string built from `fld` via gensym, `:boolean`
	  a random true/false, `:int` a random integer in [0, 50000) and
	  `:float` a random double in [0, 50000).  Any other tag makes `case`
	  throw, since there is no default clause."
	  [fld t]
	  (case t
	    :int     (rand-int 50000)
	    :float   (* (rand) 50000)
	    :boolean (rand-nth [true false])
	    :text    (str (gensym fld))))
	;;generate random records from the schema.
	(defn fake-record
	  "Build one random record: walks the `fillrecord` schema and replaces
	  each field's type tag with a value generated by `fld->val`.  The schema
	  map itself is the starting accumulator, so every key is overwritten."
	  []
	  (let [fill-field (fn [rec fld type-tag]
	                     (assoc rec fld (fld->val fld type-tag)))]
	    (reduce-kv fill-field fillrecord fillrecord)))
	;;Using the preceding functions, we dump 4.395605x10^6 records to
	;;the file at the path below.
	(def testfile "c:/path/to/bigfile.txt")
	;;Dumping elided for brevity...the end result is a ~2GB tab-delimited
	;;text file with headers in the first line.
	;;NOTE: the `...` body is a placeholder for the elided implementation; it
	;;has to be filled in before this form can be loaded.
	(defn records->file [path] ...)


	;;Testfile has these characteristics:
	;;4.3x10^6 newline-separated lines of
	;;490 utf-8 Chars /line
	;;40 tab-delimited fields (although it doesn't matter for this).
	;;2GB in size.

	;;==Simple Test - Count the lines of the file==

	;;Simply increment an int, occasionally printing out
	;;every 100000th line for status.
	(defn count-lines
	  "Reduce over `r`, counting its elements.  Before each increment the
	  running count is printed when it is a multiple of 100000 (so 0 is
	  printed at the first element).  Returns the final count."
	  [r]
	  (let [tally (fn [seen _line]
	                (when (zero? (rem seen 100000))
	                  (println seen))
	                (unchecked-inc seen))]
	    (reduce tally 0 r)))
	;;Time how long it takes us to traverse and count the file.
	;;overhead from printing is infinitesimal, so this should
	;;simply walk the file(seq).
	(defn test
	  "Time a full traversal of the file at `path`, counting lines with
	  `count-lines`.  `time` prints the elapsed time; the line count is
	  returned.

	  Options:
	    :n - when supplied, passed as the second argument to `iota/seq`
	         (used by the caller as a buffer size); when omitted, the
	         one-argument `iota/seq` is used.

	  The branch is chosen on `n` itself.  The previous version used
	  `(iota/seq path n)` as the `if` condition, which built a throwaway seq
	  and then always counted `(iota/seq path)`, so `n` never took effect.

	  NOTE(review): this name shadows clojure.core/test within the namespace."
	  [path & {:keys [n]}]
	  (time (count-lines (if n
	                       (iota/seq path n)
	                       (iota/seq path)))))

	;;==specs==
	;;Machine (laptop) is underspecced:
	;;Intel Core2 T7200 @ 2.00 ghz
	;;2GB Ram (this hurts :( )
	;;Windows 7 64bit, on 64 bit JVM
	;;NOT running -server in jvm opts (clients can't have jdk currently).

	;;==results==
	;;Run 1: no :n option.
	;;This chokes entirely...it doesn't even print the initial 0 within minutes.
	(test testfile)
	;;Run 2: I saw the default buffer size in FileSeq is pretty big,
	;;so try again asking for a 4KB buffer via :n.
	(test testfile :n (* 4 1024))
	;;Cooks along nicely, right up until about 3x10^6, then
	;;starts chugging. Looks like GC goes through the roof, although
	;;we never grow the heap....(never go above 490mb or so, despite
	;;using -Xmx1g)

	;;run completes in 291443 ms

	;;visualvm shows most of the time is spent grinding in MMap/get
	;;(understandable from glancing at the source).

	;;GC behavior is really weird to me though, can't help but think it'd
	;;be fine if iota would take advantage of the available heap. Maybe
	;;this has something to do with filechannel? We get a jagged garbage
	;;collection trail, hovering under 500MB without ever growing the heap.


	;;==Alternatives==
	;;--FastUtil
	;;In contrast, using it.unimi.dsi.fastutil.io.FastBufferedReader
	;;from FastUtil library, wrapped in a clojure compatible reducer,
	;;I churn through the same file pretty easily.
	;;It's a little slower out the gates (in terms of visible printing/traversal)
	;;BUT it's consistent, and completes in 63145 ms.
	;;Notable differences: FastBufferedReader is unsynchronized (dunno if this
	;;affects the aforementioned observations). I'd have to line-by-line the
	;;source to see any other differences. FastBufferedReader is not using
	;;file channels by default.

	;;We get a nice saw-tooth GC collection profile, peaking at max usage of
	;;~583mb. Heap is grown to 715mb.

	;;--MMap
	;;Also, I implemented an mmap version based off Eric Rochester's example at:
	;;http://www.ericrochester.com/pages/code/parallel-io-with-mmap/
	;;This version is a bit slower than the FastBufferedReader, but it works.
	;;It uses nio/mmap and a (* 10 1024 1024) buffer. Job completes in 84472ms.
	;;Expands the heap during processing beyond 500mb to 779mb.
	;;We get a higher-frequency gc wave, with peak usage at 668mb.

	;;Both the mmap and fastutil implementations are actually returning strings
	;;as part of their .readLine implementation, so some of the gc is probably
	;;due to string creation.

	;;==Summary==
	;;Both alternative implementations finish the job much faster than iota/seq.
	;;Both actually take advantage of the available heap (although they don't
	;;come near exhausting it). They are also coercing lines to strings in the
	;;process.

	;;Why is iota/seq sucking wind on this?