whilo · June 17, 2015 07:54
diff --git a/gistfile1.r b/gistfile1.r

 click1 <- function(){
    x <- runif(1);y <- runif(1)
    plot(x = x, y = y, xlim = c(0, 1), ylim = c(0, 1),
         main = "Bitte auf den Punkt klicken",
         xlab = '', ylab = '',
         axes = FALSE, frame.plot = TRUE)
    clicktime <- system.time(xyclick <- locator(1))
    c(timestamp = Sys.time(),
      x = x, y = y,
      xclick = xyclick$x, yclick = xyclick$y,
      tclick = clicktime[3])
 }

 # 1 click wegwerfen, 4-5 einfuehrungsclicks
 # lm modell, laufende nummer des clicks muss aufgenommen werden

 # besserer vergleich als qqplot ist linearisieren
 # vergleich gerade
 click <- function(count) {
    # 5 einfuehrungsklicks
    initClicks <- Map(function(i) {c(index=i, click1())}, 0:5)

    #clicks <- Map(function(i) { c(i, click1())}, 1:count)
    clicks <- list()
    lastClick <- tail(initClicks,1)[[1]] # truely elegant ...
    prevX <- lastClick["x"]
    prevY <- lastClick["y"]
    for (i in 1:count) {
        click <- c(index=i, click1())
        xclick <- click["xclick"]
        yclick <- click["yclick"]
        click <- c(click, dist=(click["x"] - xclick)**2 + (click["y"] - yclick)**2)
        click <- c(click, way=(xclick-prevX)**2 + (yclick - prevY)**2)
        clicks <- append(clicks, list(click))

        prevX <<- xclick
        prevY <<- yclick
    }
    clicks <- do.call(rbind.data.frame, clicks)
    colnames(clicks) <- c("index", "timestamp", "x", "y", "xclick", "yclick", "tclick", "dist", "way")
    clicks
 }

 # wie viele clicks sinnvoll fuer lm? ermuedung, test auf signifikanz?
 # rerun experiment
 write.table(click(20), "click20r")
 write.table(click(20), "click20l")

 write.table(click(20), "click20r2")
 write.table(click(20), "click20l2")

 # read stored data
 right <- rbind(read.table("click20r"), read.table("click20r2"))
 left <- rbind(read.table("click20l"), read.table("click20l2"))

 # not what is difference between left and right?

 # 0-hypothesis: are identical distributed

 # alternative
 # shift? in linear model
 # x_i = mu + e_i  i=1,...,n1=n_x
 # y_i = mu + a + e_i i=1,.., n_2=n_y
 # t-test detects shift alternative

 # alternative
 # different distribution function
 # KS test between ecdfs, continuous cdfs necessary (?)


 # appending of lists and dataframes
 stack(list(left=left$tclick, right=right$tclick))

 # idea, compare time / way, should be normal distributed? t-test for significance

 ?sample
 sample(1:12) # permutation

 wilcox.test( right$tclick, left$tclick ) # stetigkeitskorrektur

 require(coin)

 wilcox_test(right$tclick, left$tclick )



 hist(right$tclick / right$way)

 # empirical cdf, shift between left and right clearly visible
 plot(ecdf(left$tclick), col="green")
 lines(ecdf(right$tclick), col="red")
 legend(x=1,y=1,legend=c("left", "right"), fill=c("green", "red"))

 ks.test(left$tclick, right$tclick)
 # p-value: 0.001 (that dists are the same)


 # t-test:
 # empirical estimator for mean of two groups
 # simple case, both same variance, estimate variance
 t.test(left$tclick, right$tclick)

 t.test(left$tclick, right$tclick, var.equal=TRUE)
 # more degrees of freedom

 # linear model: (only if normal distributed)


 # qqplot comparison of time to click
 qqplot(right$tclick, left$tclick)

 ppplot(right$tclick, distf=rnorm)



 rlm <- lm(formula= right$tclick ~ right$way + right$index + right$dist)
 llm <- lm(formula= left$tclick ~ left$way + left$index + left$dist)

 require(graphics)

 par(mfrow = c(2,2))
 plot(rlm)

 par(mfrow = c(2,2))
 plot(llm)

 # monte carlo confidence bounds for theoretical distribution


 # qqplot 300 beobachtungen

 # genauigkeit, laenge des weges, anzahl der clicks


 unif50 <- runif(50)
 unif100 <- runif(100)
 norm50 <- rnorm(50)
 norm100 <- rnorm(100)
 lognorm50 <- exp(rnorm(50))
 lognorm100 <- exp( rnorm(100))

 oldpar <- par(mfrow = c(2, 3))
 plot(ecdf(unif50), pch = "[")
 plot(ecdf(norm50), pch = "[")
 plot(ecdf(lognorm50), pch = "[")
 plot(ecdf(unif100), pch = "[")
 plot(ecdf(norm100), pch = "[")
 plot(ecdf(lognorm100), pch = "[")
 par(oldpar)

 oldpar <- par(mfrow = c(2, 3))
 qqnorm(unif50, main = "Normal Q-Q unif50")
 qqnorm(norm50, main = "Normal Q-Q norm50")
 qqnorm(lognorm50, main = "Normal Q-Q lognorm50")
 qqnorm(unif100, main = "Normal Q-Q unif100")
 qqnorm(norm100, main = "Normal Q-Q norm100")
 qqnorm(lognorm100, main = "Normal Q-Q lognorm100")
 par(oldpar)




 tq99 <- function(n, df=1) {
    t100 <- rt(n, df)
    q <- quantile(t100, probs = seq(0.01, 0.99, 0.98))
    st100 <- sort(t100)
    ust100 <- st100[q["1%"]<st100]
    cst100 <- ust100[q["99%"]>ust100]
    cst100
 }




 # TODO why 19? Aufgabe 1.10: abstract P(T(X0) > T(Xi))
 ppplot <- function(x, y=NULL, samps = 19, distf=rnorm) {
    if(is.null(y)) {
        y <- sort(distf(length(x))) #(1:length(x))/length(x)
    } else {
        y <- sort(y)
    }
    plot(sort(x), y, xlab = substitute(x), ylab = expression(F[n]),
         main = "Verteilungsfunktion mit Monte-Carlo-Band",
         type = "s")
    mtext(paste(samps, "Monte-Carlo-Stichproben"), side = 3)
    samples <- matrix(distf(length(x)* samps), nrow = length(x), ncol = samps)
    samples <- apply(samples, 2, sort)
    envelope <- t(apply(samples, 1, range))
    lines(envelope[, 1], y, type = "s", col = "red");
    lines(envelope[, 2], y, type = "s", col = "red")
 }



 ppplot(norm100, tq99(100))

 ppplot(rnorm(300), rnorm(300))

 ks.test(rnorm(300), runif(300))














 # chisq vs. ks
 # https://stackoverflow.com/questions/21655653/r-chi-square-goodness-of-fit-for-random-numbers-generated

 set.seed(1)
 df <- data.frame(y = rexp(1000),
                 z = rcauchy(1000, 100, 100))

 ks.test(df$y,"pexp")
 # One-sample Kolmogorov-Smirnov test
 #
 # data:  df$y
 # D = 0.0387, p-value = 0.1001
 # alternative hypothesis: two-sided

 ks.test(df$z,"pcauchy",100,100)
 #  One-sample Kolmogorov-Smirnov test
 #
 # data:  df$z
 # D = 0.0296, p-value = 0.3455
 # alternative hypothesis: two-sided

 breaks <- c(seq(0,10,by=1))
 O <- table(cut(df$y,breaks=breaks))
 p <- diff(pexp(breaks))
 chisq.test(O,p=p, rescale.p=T)

 #   Chi-squared test for given probabilities
 #
 # data:  O
 # X-squared = 7.9911, df = 9, p-value = 0.535


 par(mfrow=c(1,2))
 plot(qexp(seq(0,1,0.01)),quantile(df$y,seq(0,1,0.01)),
     main="Q-Q Plot",ylab="df$Y", xlab="Exponential",
     xlim=c(0,5),ylim=c(0,5))
 plot(qcauchy(seq(0,.99,0.01),100,100),quantile(df$z,seq(0,.99,0.01)),
     main="Q-Q Plot",ylab="df$Z",xlab="Cauchy",
     xlim=c(-1000,1000),ylim=c(-1000,1000))


 # lm

 x <- 1:100
 err <- rnorm(100, mean = 0, sd = 10)
 y <- 2.5*x + err

 lm(y ~ x)

 lmres <- lm(y ~ x)
 plot(x, y)
 abline(lmres)



 attach(hills)
 scotlm<-lm(time ~ dist + climb)
 attach(scotlm)


 oldpar <- par(mfrow=c(1,2))
 plot(fitted.values, residuals,xlim=c(0,200), main="Scottish Hill Races", pch=19, cex.lab = 1.5)
 text.id(fitted.values, residuals, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
 sres<-stdres(scotlm)
 plot(fitted.values,sres,xlim=c(0,200), main="Scottish Hill Races",
 ylab="MASS::stdres", pch=19, cex.lab = 1.5)
 text.id(fitted.values,sres, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
 par(oldpar)


 oldpar <- par(mfrow=c(1,2))

 scotdffits <-dffits(scotlm)

 plot(fitted.values, scotdffits,ylab="DFFITS",xlim=c(0,200),
 main="Scottish Hill Races", pch=19, cex.lab = 1.5)
 text.id(fitted.values, scotdffits, c(7,11, 18,31,33,35), row.names(hills), cex.id=1.2)
 scotcooks <-cooks.distance(scotlm)
 plot(fitted.values, scotcooks,ylab="Cooks",xlim=c(0,200),
 main="Scottish Hill Races", pch=19, cex.lab = 1.5)
 text.id(fitted.values, scotcooks, c(7,11, 18), row.names(hills), cex.id=1.2)
 par(oldpar)


 library(forward)
 fwdhills <- fwdlm(time ~dist + climb, data=hills)


 summary(hills[,1:3])


 # Guete Simulation p.126
 # task view
 foo <- Map(function(e) { t.test(exp(rnorm(1000)), exp(rnorm(1000)))$p.value }, 1:1000)


 length(foo[foo<0.05])

	click1 <- function(){
	x <- runif(1);y <- runif(1)
	plot(x = x, y = y, xlim = c(0, 1), ylim = c(0, 1),
	main = "Bitte auf den Punkt klicken",
	xlab = '', ylab = '',
	axes = FALSE, frame.plot = TRUE)
	clicktime <- system.time(xyclick <- locator(1))
	c(timestamp = Sys.time(),
	x = x, y = y,
	xclick = xyclick$x, yclick = xyclick$y,
	tclick = clicktime[3])
	}

	# 1 click wegwerfen, 4-5 einfuehrungsclicks
	# lm modell, laufende nummer des clicks muss aufgenommen werden

	# besserer vergleich als qqplot ist linearisieren
	# vergleich gerade
	click <- function(count) {
	# 5 einfuehrungsklicks
	initClicks <- Map(function(i) {c(index=i, click1())}, 0:5)

	#clicks <- Map(function(i) { c(i, click1())}, 1:count)
	clicks <- list()
	lastClick <- tail(initClicks,1)[[1]] # truely elegant ...
	prevX <- lastClick["x"]
	prevY <- lastClick["y"]
	for (i in 1:count) {
	click <- c(index=i, click1())
	xclick <- click["xclick"]
	yclick <- click["yclick"]
	click <- c(click, dist=(click["x"] - xclick)2 + (click["y"] - yclick)2)
	click <- c(click, way=(xclick-prevX)2 + (yclick - prevY)2)
	clicks <- append(clicks, list(click))

	prevX <<- xclick
	prevY <<- yclick
	}
	clicks <- do.call(rbind.data.frame, clicks)
	colnames(clicks) <- c("index", "timestamp", "x", "y", "xclick", "yclick", "tclick", "dist", "way")
	clicks
	}

	# wie viele clicks sinnvoll fuer lm? ermuedung, test auf signifikanz?
	# rerun experiment
	write.table(click(20), "click20r")
	write.table(click(20), "click20l")

	write.table(click(20), "click20r2")
	write.table(click(20), "click20l2")

	# read stored data
	right <- rbind(read.table("click20r"), read.table("click20r2"))
	left <- rbind(read.table("click20l"), read.table("click20l2"))

	# not what is difference between left and right?

	# 0-hypothesis: are identical distributed

	# alternative
	# shift? in linear model
	# x_i = mu + e_i i=1,...,n1=n_x
	# y_i = mu + a + e_i i=1,.., n_2=n_y
	# t-test detects shift alternative

	# alternative
	# different distribution function
	# KS test between ecdfs, continuous cdfs necessary (?)


	# appending of lists and dataframes
	stack(list(left=left$tclick, right=right$tclick))

	# idea, compare time / way, should be normal distributed? t-test for significance

	?sample
	sample(1:12) # permutation

	wilcox.test( right$tclick, left$tclick ) # stetigkeitskorrektur

	require(coin)

	wilcox_test(right$tclick, left$tclick )



	hist(right$tclick / right$way)

	# empirical cdf, shift between left and right clearly visible
	plot(ecdf(left$tclick), col="green")
	lines(ecdf(right$tclick), col="red")
	legend(x=1,y=1,legend=c("left", "right"), fill=c("green", "red"))

	ks.test(left$tclick, right$tclick)
	# p-value: 0.001 (that dists are the same)


	# t-test:
	# empirical estimator for mean of two groups
	# simple case, both same variance, estimate variance
	t.test(left$tclick, right$tclick)

	t.test(left$tclick, right$tclick, var.equal=TRUE)
	# more degrees of freedom

	# linear model: (only if normal distributed)


	# qqplot comparison of time to click
	qqplot(right$tclick, left$tclick)

	ppplot(right$tclick, distf=rnorm)



	rlm <- lm(formula= right$tclick ~ right$way + right$index + right$dist)
	llm <- lm(formula= left$tclick ~ left$way + left$index + left$dist)

	require(graphics)

	par(mfrow = c(2,2))
	plot(rlm)

	par(mfrow = c(2,2))
	plot(llm)

	# monte carlo confidence bounds for theoretical distribution


	# qqplot 300 beobachtungen

	# genauigkeit, laenge des weges, anzahl der clicks


	unif50 <- runif(50)
	unif100 <- runif(100)
	norm50 <- rnorm(50)
	norm100 <- rnorm(100)
	lognorm50 <- exp(rnorm(50))
	lognorm100 <- exp( rnorm(100))

	oldpar <- par(mfrow = c(2, 3))
	plot(ecdf(unif50), pch = "[")
	plot(ecdf(norm50), pch = "[")
	plot(ecdf(lognorm50), pch = "[")
	plot(ecdf(unif100), pch = "[")
	plot(ecdf(norm100), pch = "[")
	plot(ecdf(lognorm100), pch = "[")
	par(oldpar)

	oldpar <- par(mfrow = c(2, 3))
	qqnorm(unif50, main = "Normal Q-Q unif50")
	qqnorm(norm50, main = "Normal Q-Q norm50")
	qqnorm(lognorm50, main = "Normal Q-Q lognorm50")
	qqnorm(unif100, main = "Normal Q-Q unif100")
	qqnorm(norm100, main = "Normal Q-Q norm100")
	qqnorm(lognorm100, main = "Normal Q-Q lognorm100")
	par(oldpar)




	tq99 <- function(n, df=1) {
	t100 <- rt(n, df)
	q <- quantile(t100, probs = seq(0.01, 0.99, 0.98))
	st100 <- sort(t100)
	ust100 <- st100[q["1%"]<st100]
	cst100 <- ust100[q["99%"]>ust100]
	cst100
	}




	# TODO why 19? Aufgabe 1.10: abstract P(T(X0) > T(Xi))
	ppplot <- function(x, y=NULL, samps = 19, distf=rnorm) {
	if(is.null(y)) {
	y <- sort(distf(length(x))) #(1:length(x))/length(x)
	} else {
	y <- sort(y)
	}
	plot(sort(x), y, xlab = substitute(x), ylab = expression(F[n]),
	main = "Verteilungsfunktion mit Monte-Carlo-Band",
	type = "s")
	mtext(paste(samps, "Monte-Carlo-Stichproben"), side = 3)
	samples <- matrix(distf(length(x)* samps), nrow = length(x), ncol = samps)
	samples <- apply(samples, 2, sort)
	envelope <- t(apply(samples, 1, range))
	lines(envelope[, 1], y, type = "s", col = "red");
	lines(envelope[, 2], y, type = "s", col = "red")
	}



	ppplot(norm100, tq99(100))

	ppplot(rnorm(300), rnorm(300))

	ks.test(rnorm(300), runif(300))














	# chisq vs. ks
	# https://stackoverflow.com/questions/21655653/r-chi-square-goodness-of-fit-for-random-numbers-generated

	set.seed(1)
	df <- data.frame(y = rexp(1000),
	z = rcauchy(1000, 100, 100))

	ks.test(df$y,"pexp")
	# One-sample Kolmogorov-Smirnov test
	#
	# data: df$y
	# D = 0.0387, p-value = 0.1001
	# alternative hypothesis: two-sided

	ks.test(df$z,"pcauchy",100,100)
	# One-sample Kolmogorov-Smirnov test
	#
	# data: df$z
	# D = 0.0296, p-value = 0.3455
	# alternative hypothesis: two-sided

	breaks <- c(seq(0,10,by=1))
	O <- table(cut(df$y,breaks=breaks))
	p <- diff(pexp(breaks))
	chisq.test(O,p=p, rescale.p=T)

	# Chi-squared test for given probabilities
	#
	# data: O
	# X-squared = 7.9911, df = 9, p-value = 0.535


	par(mfrow=c(1,2))
	plot(qexp(seq(0,1,0.01)),quantile(df$y,seq(0,1,0.01)),
	main="Q-Q Plot",ylab="df$Y", xlab="Exponential",
	xlim=c(0,5),ylim=c(0,5))
	plot(qcauchy(seq(0,.99,0.01),100,100),quantile(df$z,seq(0,.99,0.01)),
	main="Q-Q Plot",ylab="df$Z",xlab="Cauchy",
	xlim=c(-1000,1000),ylim=c(-1000,1000))


	# lm

	x <- 1:100
	err <- rnorm(100, mean = 0, sd = 10)
	y <- 2.5*x + err

	lm(y ~ x)

	lmres <- lm(y ~ x)
	plot(x, y)
	abline(lmres)



	attach(hills)
	scotlm<-lm(time ~ dist + climb)
	attach(scotlm)


	oldpar <- par(mfrow=c(1,2))
	plot(fitted.values, residuals,xlim=c(0,200), main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, residuals, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
	sres<-stdres(scotlm)
	plot(fitted.values,sres,xlim=c(0,200), main="Scottish Hill Races",
	ylab="MASS::stdres", pch=19, cex.lab = 1.5)
	text.id(fitted.values,sres, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
	par(oldpar)


	oldpar <- par(mfrow=c(1,2))

	scotdffits <-dffits(scotlm)

	plot(fitted.values, scotdffits,ylab="DFFITS",xlim=c(0,200),
	main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, scotdffits, c(7,11, 18,31,33,35), row.names(hills), cex.id=1.2)
	scotcooks <-cooks.distance(scotlm)
	plot(fitted.values, scotcooks,ylab="Cooks",xlim=c(0,200),
	main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, scotcooks, c(7,11, 18), row.names(hills), cex.id=1.2)
	par(oldpar)


	library(forward)
	fwdhills <- fwdlm(time ~dist + climb, data=hills)


	summary(hills[,1:3])


	# Guete Simulation p.126
	# task view
	foo <- Map(function(e) { t.test(exp(rnorm(1000)), exp(rnorm(1000)))$p.value }, 1:1000)


	length(foo[foo<0.05])