Last active
September 25, 2015 20:27
-
-
Save ashenfad/979783 to your computer and use it in GitHub Desktop.
Streaming (one-pass) technique for sampling with replacement
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; For a fully fleshed out library, see: | |
;; https://github.com/bigmlcom/sampling | |
;; ------------------------------------- | |
;; see http://ashenfad.blogspot.com/2011/06/single-pass-sampling-with-replacement.html | |
(ns sample.replacement | |
(:use [clojure.contrib.math :only [expt]])) | |
(defn- choose [a b] | |
(let [numerator (apply * (range (- (inc a) b) (inc a))) | |
denominator (apply * (range 1 (inc b)))] | |
(/ numerator denominator))) | |
(defn- find-occurance-prob [population-size sample-size occurances] | |
(let [single-select-prob (double (/ 1 population-size)) | |
single-unselect-prob (- 1 single-select-prob) | |
occurance-prob (expt single-select-prob occurances) | |
non-occurances (- sample-size occurances) | |
non-occurance-prob (expt single-unselect-prob non-occurances) | |
combinations (choose sample-size occurances) | |
probability (* occurance-prob non-occurance-prob combinations)] | |
probability)) | |
(defn- normalize-dist [dist-seq] | |
(let [sum (apply + (map second dist-seq))] | |
(map (fn [x] [(first x) (/ (second x) sum)]) dist-seq))) | |
(defn- occurance-dist [population-size sample-size] | |
(loop [cd-value 0 | |
distribution []] | |
(if (< cd-value 0.999999) | |
(let [occurances (count distribution) | |
occurance-prob (find-occurance-prob | |
population-size | |
sample-size | |
occurances)] | |
(recur | |
(+ cd-value occurance-prob) | |
(conj distribution [occurances occurance-prob]))) | |
(normalize-dist distribution)))) | |
(defn occurance-map [population-size sample-size] | |
(let [init-dist-seq (occurance-dist population-size sample-size)] | |
(loop [sum 0 | |
dist-seq init-dist-seq | |
dist-map (sorted-map)] | |
(let [current-occurance (first dist-seq) | |
rest-dist-seq (rest dist-seq) | |
occurance-count (first current-occurance) | |
occurance-sum (+ sum (second current-occurance)) | |
new-dist-map (assoc dist-map occurance-sum occurance-count)] | |
(if (seq rest-dist-seq) | |
(recur occurance-sum rest-dist-seq new-dist-map) | |
new-dist-map))))) | |
(defn- roll-occurances [dist-map] | |
(second (first (subseq dist-map >= (rand))))) | |
(defn- sample-occurances [dist-map datum] | |
(take (roll-occurances dist-map) (repeat datum))) | |
(defn sample-with-replacement-dist [dist-map data] | |
(mapcat | |
(fn [datum] (sample-occurances dist-map datum)) | |
data)) | |
(defn sample-with-replacement [population-size sample-size data] | |
(sample-with-replacement-dist | |
(occurance-map population-size sample-size) | |
(take population-size data))) | |
; example: | |
;(def population-size 100000) | |
;(def sample-size 10000) | |
;(def data (take population-size (repeatedly (fn [] (rand-int 10000))))) | |
;(def sample (sample-with-replacement population-size sample-size data)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment