Created
October 18, 2012 21:41
-
-
Save morenoh149/3914917 to your computer and use it in GitHub Desktop.
How do you write an ECDF function for an arbitrary dataset?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript
# Simulate `n` card payments and print, for each of two payment-amount groups
# (below $100 and above $100), a 100-row text table of "percentile" versus
# cumulative proportion of the binary `is_card_present` flag.
#
# Args:
#   n: number of simulated payments (non-negative, default 100).
#
# Returns:
#   NULL, invisibly. The function is called for its printed output.
#
# Notes:
#   * The data are random; call set.seed() beforehand for reproducible output.
#   * Payments of exactly 100 fall in neither group (strict < and >).
#   * An empty group prints NaN for percentiles 1-99 (0 / 0).
squareCDF <- function(n = 100) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  # create random data
  # seq_len() is used instead of seq(1, n) so that n = 0 yields empty columns
  # rather than the descending sequence c(1, 0).
  user_id <- seq_len(n)
  payment_id <- seq_len(n) + 100L
  payment_amount <- rnorm(n, mean = 100, sd = 50)
  is_card_present <- sample(0:1, n, replace = TRUE)
  data <- data.frame(user_id, payment_id, payment_amount, is_card_present)

  # calculate ECDF
  less100 <- subset(data, payment_amount < 100)
  more100 <- subset(data, payment_amount > 100)
  sortedless <- sort(less100$is_card_present)
  sortedmore <- sort(more100$is_card_present)
  # Step heights i / k for the optional plots at the bottom of this function.
  # seq_len() keeps these the same length as the sorted vectors when a group
  # is empty (1:0 would give two elements).
  ecdfless <- seq_len(nrow(less100)) / nrow(less100)
  ecdfmore <- seq_len(nrow(more100)) / nrow(more100)

  # function for returning area left of percentile
  # Returns the proportion of zeros in `dataPassed` for any percentile below
  # 100, and 1 at percentile 100 or above.
  # NOTE(review): the result is the same for every percentile below 100, i.e.
  # this is the ECDF of a 0/1 variable evaluated at 0 and at 1 -- confirm that
  # this is the intended meaning of "percentile".
  cp <- function(percentile, dataPassed) {
    if (percentile < 100) {
      sum(dataPassed == 0) / length(dataPassed)
    } else {
      1
    }
  }

  # Print one group's header followed by 100 quoted "percentile" "cp" rows.
  print_group <- function(title, flags) {
    print(title)
    print("percentile % cp")
    percentiles <- 1:100
    values <- vapply(percentiles, cp, numeric(1), dataPassed = flags)
    cat(sprintf("\"%d\" \"%f\"\n", percentiles, values), sep = "")
  }

  # print ECDFs in text format
  print_group("Users who processed less than $100", sortedless)
  print_group("Users who processed more than $100", sortedmore)

  # Plot ECDFs in R
  #par(mfrow=c(1,2))
  #plot(sortedless, ecdfless, type="n")
  #lines(sortedless, ecdfless)
  #plot(sortedmore, ecdfmore, type="n")
  #lines(sortedmore, ecdfmore)
  #plot(ecdf(sortedless))

  invisible(NULL)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment