# Session02
# CHAPTER 3
# CENTRAL TENDENCY
yvals<-read.csv("C:\\MSc\\Statistics\\Data\\yvalues.csv")
# open file
attach(yvals)
hist(y)
# the data value that occurs most often is called the 'mode'
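# a minimal sketch (not in the original notes) of finding the mode in R:
# table(y) counts each value, and the name of the largest count is the mode
names(which.max(table(y)))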
# the most straightforward quantitative measure of central tendency is the arithmetic mean of the data
total<-sum(y)
# 'sum' works out the total for any length of vector
n<-length(y)
# calculates the length of all data values
ybar<-total/n
# calculates the arithmetic mean
arithmetic.mean<-function(x)sum(x)/length(x)
# we define a function
data<-c(3,4,6,7)
# we define a new set of values in order to test the function
arithmetic.mean(data)
# it works
arithmetic.mean(y)
# R has built-in functions for calculating arithmetic means directly
mean(y)
# however, the arithmetic mean is highly sensitive to outliers
# a single extremely large/small value will have a big effect on the arithmetic mean
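# a quick illustration (my own sketch, not from the notes): one extreme value
# drags the mean up from 5 to over 200, while the middle sorted value barely moves
mean(c(3,4,6,7))
mean(c(3,4,6,7,1000))
sort(c(3,4,6,7,1000))[3]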
# so we need to use the median which is the middle value of the data set
# to do this:
sorted<-sort(y)
# sorts the data into ascending order
length(y)/2
# number of numbers, the middle value is half of this
# it has to be odd to have a middle value
ceiling(length(y)/2)
# ceiling rounds length(y)/2 up to the nearest integer, giving the position of the middle value
sorted[20]
# extracts the median value of y
sorted[ceiling(length(y)/2)]
# a more general way of writing it
sort(y)[ceiling(length(y)/2)]
# another more general way of defining it
# in the case that the vector contains an even number of numbers
y.even<-y[-1]
length(y.even)
# we drop the first element of y so that the vector has an even number of values
# we will work out the arithmetic average of the two values of y on either side of the middle
# in this case, the average of the 19th and 20th sorted values
sort(y.even)[19]
sort(y.even)[20]
# so in this case the median would be:
(sort(y.even)[19]+sort(y.even)[20])/2
# but we can replace 19 and 20 with 'ceiling(length(y.even)/2)' and 'ceiling(1+length(y.even)/2)'
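# the same calculation written with those general subscripts (a quick sketch, not in the original notes):
(sort(y.even)[ceiling(length(y.even)/2)]+sort(y.even)[ceiling(1+length(y.even)/2)])/2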
# there is a trick to find out if the length is odd or even: 'modulo', which is the remainder when one integer is divided by another
# an even length gives modulo 0 and an odd length gives modulo 1
# the modulo function is '%%'
38%%2
# 0
39%%2
# 1
med<-function(x){
modulo<-length(x)%%2
if (modulo==0) (sort(x)[ceiling(length(x)/2)]+sort(x)[ceiling(1+length(x)/2)])/2
else sort(x)[ceiling(length(x)/2)]
}
# if we have an even number of numbers (modulo==0) then the 'if' statement is evaluated
# if we have an odd number of numbers (modulo==1) then the 'else' is evaluated
med(y)
med(y.even)
# there is a built-in function for calculating the median
median(y)
median(y.even)
# PROCESSES THAT CHANGE MULTIPLICATIVELY
# in these conditions the appropriate measure is the geometric mean
# definition: the "n"th root of the product of the data
# E.G. 10,1,1000,1,10: multiplying them together gives 100000; we want the fifth root of this (5 numbers)
# roots are fractional powers, so the 5th root is the power 1/5=0.2
# R denotes 'powers' by '^'
100000^0.2
# 10
# the arithmetic mean on the other hand would give this
insects<-c(1,10,1000,10,1)
mean(insects)
# 204.4
# another way to calculate the geometric mean is using logarithms
# we should be able to calculate it by finding the antilog (exp) of the average of the logarithms (log) of the data
exp(mean(log(insects)))
# 10
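# a reusable version (my own sketch, using the same antilog-of-the-mean-of-logs idea):
geometric.mean<-function(x) exp(mean(log(x)))
geometric.mean(insects)
# 10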
# THE HARMONIC MEAN
# this is the reciprocal of the average of the reciprocals
# a reciprocal is 'one over'
# we take the average of the reciprocals, and the reciprocal of that average is the harmonic mean
v<-c(1,2,4,1)
length(v)/sum(1/v)
# or
1/mean(1/v)
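# a reusable version (my own sketch, the reciprocal of the mean of the reciprocals):
harmonic.mean<-function(x) 1/mean(1/x)
harmonic.mean(v)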
# CHAPTER 4
# VARIANCE
# a measure of variability is very important
# the greater the variability the greater will be our uncertainty in the values of parameters estimated from the data
# and the lower will be our ability to distinguish between competing hypotheses about the data
y<-c(13,7,5,12,9,15,6,11,9,7,12)
plot(y)
range(y)
# this quantifies the variation
plot(1:11,y,ylim = c(0,20),pch=16,col="blue")
lines(c(4.5,4.5),c(5,15),col="brown")
lines(c(4.5,3.5),c(5,5),col="brown")
lines(c(4.5,5.5),c(15,15),col="brown")
# plots a graph with lines delimiting the maximum and minimum values
# this is too dependent on the outlying values
plot(1:11,y,ylim = c(0,20),pch=16,col="blue")
abline(h=mean(y),col="green")
for(i in 1:11)lines(c(i,i),c(mean(y),y[i]),col="red")
# this ensures all the data contributes to the measure of variability
# it also draws the mean value as a horizontal (green) line
# the (red) lines go from the mean to the values, the longer they are the more variable the data
# simply adding up the lengths of the red lines is no use, because the positive departures above the mean cancel out the negative ones below it
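# a quick check of that cancellation (my own sketch): the departures from the mean sum to zero
sum(y-mean(y))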
# so we do the SUM OF SQUARES
y-mean(y)
(y-mean(y))^2
sum((y-mean(y))^2)
# VARIANCE
variance<-function(x) sum((x-mean(x))^2)/(length(x)-1)
variance(y)
var(y)
ozone<-read.csv("c:\\MSc\\Statistics\\Data\\gardens.csv")
attach(ozone)
ozone
mean(gardenA)
gardenA-mean(gardenA)
(gardenA-mean(gardenA))^2
sum((gardenA-mean(gardenA))^2)/9
mean(gardenB)
gardenB-mean(gardenB)
(gardenB-mean(gardenB))^2
sum((gardenB-mean(gardenB))^2)
sum((gardenB-mean(gardenB))^2)/9
mean(gardenC)
gardenC-mean(gardenC)
(gardenC-mean(gardenC))^2
sum((gardenC-mean(gardenC))^2)
sum((gardenC-mean(gardenC))^2)/9
var(gardenC)/var(gardenB)
# variance ratio (F); the two-tailed p-value of the F test:
2*(1-pf(10.667,9,9))
var.test(gardenB,gardenC)
plot(c(0,32),c(0,15),type="n",xlab="Sample size",ylab="Variance")
for(n in seq(3,31,2)){
for(i in 1:30){
x<-rnorm(n,mean=10,sd=2)
points(n,var(x))}}
# standard errors of the three garden means (n = 10 in each garden)
sqrt(var(gardenA)/10)
sqrt(var(gardenB)/10)
sqrt(var(gardenC)/10)
# Student's t quantiles with 9 degrees of freedom
qt(.025,9)
qt(.975,9)
qt(.995,9)
qt(.9975,9)
# half the width of the 95% confidence interval for a mean with variance 1.33333 and n = 10
qt(.975,9)*sqrt(1.33333/10)
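# putting those pieces together (my own sketch): the 95% confidence interval for the mean
# of garden B is its mean plus or minus Student's t (9 d.f.) times the standard error
mean(gardenB)+qt(c(.025,.975),9)*sqrt(var(gardenB)/10)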
# BOOTSTRAP
data<-read.csv("c:\\MSc\\Statistics\\Data\\skewdata.csv")
attach(data)
names(data)
plot(c(0,30),c(0,60),type="n",xlab="Sample size",
ylab="Confidence interval")
for(k in seq(5,30,3)){
a<-numeric(10000)
for(i in 1:10000){
a[i]<-mean(sample(values,k,replace=T))
}
points(c(k,k),quantile(a,c(.025,.975)),type="b",pch=21,bg="red")
}
quantile(a,c(0.025,0.975))
xv<-seq(5,30,0.1)
yv<-mean(values)+1.96*sqrt(var(values)/xv)
lines(xv,yv,col="blue")
yv<-mean(values)-1.96*sqrt(var(values)/xv)
lines(xv,yv,col="blue")
yv<-mean(values)-qt(.975,xv-1)*sqrt(var(values)/xv)
lines(xv,yv,lty=2,col="green")
yv<-mean(values)+qt(.975,xv-1)*sqrt(var(values)/xv)
lines(xv,yv,lty=2,col="green")
# CHAPTER 5
# DATA SUMMARY IN THE ONE-SAMPLE CASE
data<-read.csv("c:\\MSc\\Statistics\\Data\\example.csv")
attach(data)
summary(y)
boxplot(y)
hist(y)
length(table(y))
plot(range(y),c(0,10),type="n",xlab="yvalues",ylab="")
for(i in 1:100)lines(c(y[i],y[i]),c(0,1),col="blue")
(max(y)-min(y))/10
diff(range(y))/11
# NORMAL DISTRIBUTION
score<-2:12
ways<-c(1,2,3,4,5,6,5,4,3,2,1)
game<-rep(score,ways)
game
sample(game,1)
outcome<-numeric(10000)
for(i in 1:10000) outcome[i]<-sample(game,1)
hist(outcome,breaks=(1.5:12.5))
mean.score<-numeric(10000)
for(i in 1:10000)mean.score[i]<-mean(sample(game,3))
hist(mean.score,breaks=(1.5:12.5))
mean(mean.score)
sd(mean.score)
xv<-seq(2,12,0.1)
yv<-10000*dnorm(xv,mean(mean.score),sd(mean.score))
hist(mean.score,breaks=(1.5:12.5),ylim=c(0,3000),
col="yellow",main="")
lines (xv,yv,col="red")
standard.deviations<-seq(-3,3,0.01)
pd<-dnorm(standard.deviations)
plot(standard.deviations,pd,type="l",col="blue")
pnorm(-2)
pnorm(-1)
1-pnorm(3)
qnorm(c(0.025,0.975))
# the two-tailed 95% critical values (approximately -1.96 and +1.96)
ht<-seq(150,190,0.01)
plot(ht,dnorm(ht,170,8),type="l",col="brown",
ylab="Probability density",xlab="Height")
pnorm(-1.25)
pnorm(1.875)
1-pnorm(1.875)
pnorm(1.25)-pnorm(-0.625)
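# the same probabilities without standardising first (a sketch): pnorm also accepts
# the mean and standard deviation directly
pnorm(160,170,8)
1-pnorm(185,170,8)
pnorm(180,170,8)-pnorm(165,170,8)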
par(mfrow=c(2,2))
ht<-seq(150,190,0.01)
pd<-dnorm(ht,170,8)
plot(ht,dnorm(ht,170,8),type="l",col="brown",
ylab="Probability density",xlab="Height")
plot(ht,dnorm(ht,170,8),type="l",col="brown",
ylab="Probability density",xlab="Height")
yv<-pd[ht<=160]
xv<-ht[ht<=160]
xv<-c(xv,160,150)
yv<-c(yv,yv[1],yv[1])
polygon(xv,yv,col="orange")
plot(ht,dnorm(ht,170,8),type="l",col="brown",
ylab="Probability density",xlab="Height")
xv<-ht[ht>=185]
yv<-pd[ht>=185]
xv<-c(xv,190,185)
yv<-c(yv,yv[501],yv[501])
polygon(xv,yv,col="blue")
plot(ht,dnorm(ht,170,8),type="l",col="brown",
ylab="Probability density",xlab="Height")
xv<-ht[ht>=160 & ht<=180]
yv<-pd[ht>=160 & ht<=180]
xv<-c(xv,180,160)
yv<-c(yv,pd[1],pd[1])
polygon(xv,yv,col="green")
# PLOTS FOR TESTING NORMALITY OF SINGLE SAMPLES
data<-read.csv("c:\\MSc\\Statistics\\Data\\skewdata.csv")
attach(data)
qqnorm(values)
qqline(values,lty=2)
light<-read.csv("c:\\MSc\\Statistics\\Data\\light.csv")
attach(light)
names(light)
hist(speed)
summary(speed)
# INFERENCE IN THE ONE-SAMPLE CASE
wilcox.test(speed,mu=990)
# BOOTSTRAP IN HYPOTHESIS TESTING WITH SINGLE SAMPLES
a<-numeric(10000)
for(i in 1:10000) a[i]<-mean(sample(speed,replace=T))
hist(a)
max(a)
# STUDENT'S t DISTRIBUTION
plot(c(0,30),c(0,10),type="n",
xlab="Degrees of freedom",
ylab="Students t value")
lines(1:30,qt(0.975,df=1:30),col="red")
abline(h=1.96,lty=2,col="green")
xvs<-seq(-4,4,0.01)
plot(xvs,dnorm(xvs),type="l",
ylab="Probability density",xlab="Deviates")
lines(xvs,dt(xvs,df=5),col="red")
qt(0.975,5)
# SKEW
skew<-function(x){
m3<-sum((x-mean(x))^3)/length(x) # third moment about the mean
s3<-sqrt(var(x))^3 # standard deviation cubed
m3/s3 }
hist(values,main="",col="green")
skew(values)
skew(values)/sqrt(6/length(values))
1-pt(2.949,28)
skew(sqrt(values))/sqrt(6/length(values))
skew(log(values))/sqrt(6/length(values))
# KURTOSIS
kurtosis<-function(x){
m4<-sum((x-mean(x))^4)/length(x) # fourth moment about the mean
s4<-var(x)^2 # variance squared
m4/s4-3 } # subtract 3 so that a normal distribution has kurtosis 0
kurtosis(values)
kurtosis(values)/sqrt(24/length(values))