Last active
November 9, 2015 23:01
-
-
Save nickpettican/1d227863e7bb08d9e7c6 to your computer and use it in GitHub Desktop.
Session02
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CHAPTER 3 | |
#ENTRAL TENDENCY | |
yvals<-read.csv("C:\\MSc\\Statistics\\Data\\yvalues.csv") | |
# open file | |
attach(yvals) | |
hist(yvals) | |
# the data values that occour the most are called 'mode' | |
# the most straighforward quantitative measure of central tendency is the arithmetic mean of the data | |
total<-sum(y) | |
# 'sum' works out the total for any length of vector | |
n<-length(y) | |
# calculates the length of all data values | |
ybar<-total/n | |
# calculates the arithmetic mean | |
arithmetic.mean<-function(x)sum(x)/length(x) | |
# we define a function | |
data<-(3,4,6,7) | |
# we define a new set of values in order to test the data | |
arithmetic.mean(data) | |
# it works | |
airthmetic.mean(y) | |
# R has built-in functions for calculating arithmetic means directly | |
mean(y) | |
# however, the arithmetic mean is highly sensitive to outliners | |
# a single extremely large/small value will have a big effect on the arithmetic mean | |
# so we need to use the median which is the middle value of the data set | |
# to do this: | |
sorted<-sort(y) | |
# sorts the data into ascending order | |
length(y)/2 | |
# number of numbers, the middle value is half of this | |
# it has to be odd ta have a middle value | |
ceiling(length(y)/2) | |
# converts the value of length(y)/2 into an integer | |
sorted[20] | |
# extracts the median value of y | |
sorted[ceiling(length(y)/2)] | |
# a more general way of writing it | |
sort(y)[ceiling(length(y)/2)] | |
# another more general way of defining it | |
# in the case that the vector contains an even number of numbers | |
y.even<-y[-1] | |
length(y.even) | |
# we need to drop the first element called y | |
# we will work out the arithmetic average of the two values of y on either side of the middle | |
# in this case, the average of the 19th and 20th sorted values | |
sort(y.even)[19] | |
sort(y.even)[20] | |
# so in this case the median would be: | |
(sort(y.even)[19]+sort(y.even)[20])/2 | |
# but we can replace 19 and 20 with 'ceiling(length(y.even)/2)' and 'ceiling(1+length(y.even)/2)' | |
# there is trick to find out if the length is odd or even: 'modulo', which is the remainder when one integer is divided by another | |
# even = modulo 0 and odd = modulo 1 | |
# modulo funtion is '%%' | |
38%%2 | |
# 0 | |
39%%2 | |
# 1 | |
med<-function(x){ | |
modulo<-length(x)%%2 | |
if (modulo==0) (sort(x)[ceiling(length(x)/2)]+sort(x)[ceiling(1+length(x)/2)])/2 | |
else sort(x)[ceiling(length(x)/2)] | |
} | |
# if we have an even number of number then the 'if' statement in evaluated | |
# if we have an odd number of number (modulo==1) then else is evaluated | |
med(y) | |
med(y.even) | |
# there is a built-in function for calculating the median | |
median(y) | |
median(y.even) | |
# PROCESSES THAT CHANGE MULTIPLICATIVELY | |
# in these conditions the appropriate measure is the geometric mean | |
# definition: the "n"th root of the product of the data | |
# E.G. 10,1,1000,1,10 multiplying them together give 100000; we want the fifth root of this (5 numbers) | |
# roots are fractional powers, so the 5th root 1/5=0.2 | |
# R denotes 'powers' by '^' | |
100000^0.2 | |
# 10 | |
# the arithmetic mean on the other hand would give this | |
insects<-c(1,10,1000,10,1) | |
mean(insects) | |
# 204.4 | |
# another way to calculate the geometric mean is using algorithms | |
# we should be able to calculate it by finding the antilog (exp) of the average of the algorithms (log) of the data | |
exp(mean(log(insects))) | |
# 10 | |
# THE HARMONIC MEAN | |
# this is the reciprocal of the average of the reciprocals | |
# a reciprocal is 'one over' | |
# the average of our reciprocals is 'average' and the reciprocal of this average is the harmonic mean | |
v<-c(1,2,4,1) | |
length(v)/sum(1/v) | |
# or | |
1/mean(1/v) | |
# CHAPTER 4 | |
#VARIENCE | |
# measure of variability is very important | |
# the greater the variability the greater will be our uncertainty in the values of parameters estimated from the data | |
# and the lower will be our ability to distinguish between competing hypotheses about the data | |
y<-c(13,7,5,12,9,15,6,11,9,7,12) | |
plot(y) | |
range(y) | |
# this quantifies the variation | |
plot(1:11,y,ylim = c(0,20),pch=16,col="blue") | |
lines(c(4.5,4.5),c(5,15),col="brown") | |
lines(c(4.5,3.5),c(5,5),col="brown") | |
lines(c(4.5,5.5),c(15,15),col="brown") | |
# plots a graph with lines delimiting the maximum and minimum values | |
# this is too dependent on the outlying values | |
plot(1:11,y,ylim = c(0,20),pch=16,col="blue") | |
abline(h=mean(y),col="green") | |
for(i in 1:11)lines(c(i,i),c(mean(y),y[i]),col="red") | |
# this ensures all the data contributes to the measure of variability | |
# it also estimates de mean value (green) | |
# the (red) lines go from the mean to the values, the longer they are the more variable the data | |
# adding the lengths of the red lines proves unyielding as the top positive lines will be cancelled by the negative | |
# so we do the SUM OF SQUARES | |
y-mean(y) | |
(y-mean(y))^2 | |
sum((y-mean(y))^2) | |
# VARIANCE | |
variance<-function(x) sum((x-mean(x))^2)/(length(x)-1) | |
variance(y) | |
var(y) | |
ozone<-read.csv("c:\\MSc\\Statistics\\Data\\gardens.csv") | |
attach(ozone) | |
ozone | |
mean(gardenA) | |
gardenA-mean(gardenA) | |
(gardenA-mean(gardenA))^2 | |
sum((gardenA-mean(gardenA))^2)/9 | |
mean(gardenB) | |
gardenB-mean(gardenB) | |
(gardenB-mean(gardenB))^2 | |
sum((gardenB-mean(gardenB))^2) | |
sum((gardenB-mean(gardenB))^2)/9 | |
mean(gardenC) | |
gardenC-mean(gardenC) | |
(gardenC-mean(gardenC))^2 | |
sum((gardenC-mean(gardenC))^2) | |
sum((gardenC-mean(gardenC))^2)/9 | |
var(gardenC)/var(gardenB) | |
2*(1-pf(10.667,9,9)) | |
var.test(gardenB,gardenC) | |
plot(c(0,32),c(0,15),type="n",xlab="Samplesize",ylab="Variance") | |
for(n in seq(3,31,2)){ | |
for(i in 1:30){ | |
x<-norm(n,mean=10,sd=2) | |
points(n,var(x))}} | |
sqrt(var(gardenA)/10) | |
sqrt(var(gardenB)/10) | |
sqrt(var(gardenC)/10) | |
qt(.025,9) | |
qt(.975,9) | |
qt(.995,9) | |
qt(.9975,9) | |
qt(.975,9)*sqrt(1.33333/10) | |
# BOOTSTRAP | |
data<-read.csv("c:\\MSc\\Statistics\\Data\\skewdata.csv") | |
attach(data) | |
names(data) | |
plot(c(0,30),c(0,60),type="n",xlab="Sample size", | |
ylab="Confidence interval") | |
for(k in seq(5,30,3)){ | |
a<-numeric(10000) | |
for(i in 1;10000){ | |
a[i]<-mean(sample(values,k,replace=T)) | |
} | |
points(c(k,k),quantile(a,c(.025,.975)),type="b",pch=21,bg="red") | |
quantile(a,c(0.025,0.975)) | |
xv<-seq(5,30,0.1) | |
yv<-mean(values)+1.96*sqrt(var(values)/xv) | |
lines(xv,yv,col="blue") | |
yv<-mean(values)-1.96*sqrt(var(values)/xv) | |
lines(xv,yv,col="blue") | |
yv<-mean(values)-qt(.975,xv-1)*sqrt(var(values)/xv) | |
lines(xv,yv,lty=2,col="green") | |
yv<-mean(values)+qt(.975,xv-1)*sqrt(var(values)/xv) | |
lines(xv,yv,lty=2,col="green") | |
# CHAPTER 5 | |
#DATA SUMMARY IN THE ONE-SAMPLE CASE | |
data<-read.csv("c:\\MSc\\Statistics\\Data\\example.csv") | |
attach(data) | |
summary(y) | |
boxplot(y) | |
hist(y) | |
length(table(y)) | |
plot(range(y),c(0,10),type="n",xlab="yvalues",ylab="") | |
for(i in 1:100)lines(c(y[i],y[i]),c(0,1),col="blue") | |
(max(y)-min(y))/10 | |
diff(range(y))/11 | |
# NORMAL DISTRIBUTION | |
score<-2:12 | |
ways<-c(1,2,3,4,5,6,5,4,3,2,1) | |
game<-rep(score,ways) | |
game | |
sample(game,1) | |
outcome<-numeric(10000) | |
for(i in 1:10000) outcome[i]<-sample(game,1) | |
hist(outcome,breaks=(1.5:12.5)) | |
mean.score<-numeric(10000) | |
for(i in 1:10000)mean.score[i]<-mean(sample(game,3)) | |
hist(mean.score,break=(1.5:12.5)) | |
mean(mean.score) | |
sd(mean.score) | |
xv<-seq(2,12,0.1) | |
yv<-10000*dnorm(xv,mean(mean.score),sd(mean.score)) | |
hist(mean.score,breaks=(1.5:12.5),ylim=c(0,3000), | |
col="yellow",main="") | |
lines (xv,yv,col="red") | |
standard.deviations<-seq(-3,3,0.01) | |
pd<-dnorm(standard.deviations) | |
plot(standard.deviations,pd,type="l",col="blue") | |
pnorm(-2) | |
pnorm(-1) | |
1-pnorm(3) | |
qnorm(c(0.025,0.0975)) | |
ht<-seq(150,190,0.01) | |
plot(ht,dnorm(ht,170,8),type="l",col="brown", | |
ylab="Probability density",xlab="Height") | |
pnorm(-1.25) | |
pnorm(1.875) | |
1-pnorm(1.875) | |
pnorm(1.25)-pnorm(-0.625) | |
par(mfrow=c(2,2)) | |
ht<-seq(150,190,0.01) | |
pd<-dnorm(ht,170,8) | |
plot(ht,dnorm(ht,170,8),type="l",col="brown", | |
ylab="Probability density",xlab="Height") | |
plot(ht,dnorm(ht,170,8),type="l",col="brown", | |
ylab="Probability density",xlab="Height") | |
yv<-pd[ht<=160] | |
xv<-ht[ht<=160] | |
xv<-c(xv,160,150) | |
yv<-c(yv,yv[1],yv[1]) | |
polygon(xv,yv,col="orange") | |
plot(ht,dnorm(ht,170,8),type="l",col="brown", | |
ylab="Probability density",xlab="Height") | |
xv<-ht[ht>=185] | |
yv<-pd[ht>=185] | |
xv<-c(xv,190,185) | |
yv<-c(yv,yv[501],yv[501]) | |
polygon(xv,yv,col="blue") | |
plot(ht,dnorm(ht,170,8),type="l",col="brown", | |
ylab="Probability density",xlab="Height") | |
xv<-ht[ht>=160 & ht<=180] | |
vv<-ht[ht>=160 & ht<=180] | |
xv<-c(xv,180,160) | |
yv<-c(yv,pd[1],pd[1]) | |
polygon(xv,yv,col="green") | |
# PLOTS FOR TESTING NORMALITY OF SINGLE SAMPLES | |
data<-read.csv("c:\\MSc\\Statistics\\Data\\skewdata.csv") | |
attach(data) | |
qqnorm(values) | |
qqline(values,lty=2) | |
light<-read.csv("c:\\MSc\\Statistics\\Data\\light.csv") | |
attach(light) | |
names(light) | |
hist(speed) | |
summary(speed) | |
# INFERENCE IN THE ON-SAMPLE CASE | |
wilcox.test(speed,mu=990) | |
# BOOTSTRAP IN HYPOTHESIS TESTING WITH SINGLE SAMPLES | |
a<-numeric(10000) | |
for(i in 1:10000) a[i]<-mean(sample(speed,replace=T)) | |
hist(a) | |
max(a) | |
# STUDENT'S t DISTRIBUTION | |
plot(c(0,30)c(0,10),type="n", | |
xlab="Degrees of freedom", | |
ylab="Students t value") | |
lines(1:30,qt(0.975,df=1:30),col="red") | |
abline(h=1.96,lty=2,col="green") | |
xvs<-seq(-4,4,0.01) | |
plot(xvs,dnorm(xvs),type="l", | |
ylab="Probability density",xlab="Deviates") | |
lines(xvs,dt(xvs,df=5),col="red") | |
qt(0.975,5) | |
# SKEW | |
skew<-function(x){ | |
m3<-sum((x-mean(x))^3)/length(x) | |
s3<-sqrt(var(x))^3 | |
m3/s3 } | |
hist(values,main="",col="green") | |
skew(values) | |
skew(values)/sqrt(6/length(values)) | |
1-pt(2.949,28) | |
skew(sqrt(values))/sqrt(6/length(values)) | |
skew(log(values))/sqrt(6/length(values)) | |
# KURTOSIS | |
kurtosis<-function(x){ | |
m4<-sum((x-mean(x))^4)/length(x) | |
s4<-var(x)^2 | |
m4/s4-3 } | |
kurtosis(values) | |
kurtosis(values)/sqrt(24/length(values)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment