Skip to content

Instantly share code, notes, and snippets.

@nickpettican
Last active June 15, 2016 15:14
Show Gist options
  • Save nickpettican/7ef9598326b22e75b940 to your computer and use it in GitHub Desktop.
Save nickpettican/7ef9598326b22e75b940 to your computer and use it in GitHub Desktop.
Stats01
# CHAPTER 1
# FUNDAMENTALS
# The response variable is the things you are working on, the variable whose variation you are attempting to understand.
# This is the variable that goes in the "y" axis of the graph (the ordinate).
# The explanatory variable goes on the "x" axis of the graph (the abscissa).
# The explanatory variables:
# - Regression: all explanatory variables continuous.
# - ANOVA: all explanatory variables categorical.
# - ANCOVA: some explanatory variables continuous some categorical.
# The response variable:
# - Regression, ANOVA or ANCOVA: continuous.
# - Logistic regression: proportion.
# - Log linear models: count.
# - Binary logistic analysis: binary.
# - Survival analysis: time at death.
# EVERYTHING VARIES:
# Heterogeneity is universal, so we need a way of discriminating between variation that is scientifically interesting
# and variation that just reflects background heterogeneity.
worms <- read.csv("C:\\MSc\\Statistics\\worms.csv")
# opens the file worms.csv
attach(worms)
# takes a "database"
names(worms)
# prints the name of the columns
# subscripts in R appear within scare brackets, thus y[7] is the 7th element of the vector "y"
# z[2,6] it is the second row of the 6th column.
# [, means all the rows
# ,] means all the columns
worms[,1:3]
# to view the first, second and third column.
worms[5:15,]
# to view the middle 11 rows of all the columns of the dataframe
worms[Area>3&Slope<3,]
# this selects onyl the rows which have Area > 3 and Slope <3 using ']
worms[order(Area),]
# this sorts the variable "Area" in ascending order
worms[rev(order(worms[,5])),c(5,7)]
# this sorts the 5th and 7th column into descending order
# SUMMARISING THE CONTENTS OF A DATAFRAME
summary(worms)
with(worms,tapply(Worm.density,Vegetation,mean))
# to find out the mean number under different variables
# for a single response variable you can use "tapply" and "with"
aggregate(worms[,c(2,3,5,7)], list(Vegetation),mean)
# "aggregate" calculates the mean values of all the continuous variables summarised at the same time
# "aggregate" assigns the word "Group.1" to the "Vegetation" column as a default
aggregate(worms[,c(2,3,5,7)], list(Communit=Vegetation),mean)
# this assigns the word "Community to the column instead
aggregate(worms[,c(2,3,5,7)], list(Moisture=Damp,Community=Vegetation),mean)
# you can do multiple classifications using two or more categorical explanatory variables
with(worms,tapply(Slope,list(Damp,Vegetation),mean))
# tapply operator produces NA for missing combinations
# GET TO KNOW YOUR DATA
head(data)
# prints the first 6 lines of the column
plot(y)
# produces a scatterplot with the values of "y"
which(y>10)
# prints the number of the line that has a value of >10
y[50]<-2.179386
# assigns the value 2.179386 to y[50]
table(treatment)
# this checks the factor levels of the variable "treatment"
# in this case we found a mistake in one of the entries
which(treatment=="nitogen")
# this finds the mistake: in this case "nitogen"
treatment[11]<-"nitrogen"
# this fixes the mistake, attributing the word "nitrogen" to the entry.
# Note that words need to be in brackets ""
# to save the file
write(x, file = "data",
ncolumns = if(is.character(x)) 1 else 5,
append = FALSE, sep = " ")
# it should work just by typing:
write.csv(name_of_the_file, file = "new_name.csv")
# when saving files it is worth setting a directory to save the files
getwd()
# this prints the current working directory
setwd("C:\\MSc\\Statistics\\Satistics")
# this sets the current directory
# note that "C:\\" must be in capitols
detach(data)
# this stops R from using the file "data" again
rm(data)
# this operator removes "data" from R
# RELATIONSHIPS
plot(x,y,pch=21,bg="red")
# the response variable is "y" and the explanatory variable is "x"
# this plots a graph with red spots as the values in the entries
# pch stands for plotting character, pch=21 gives a black edge and bright colour fill
# bg is background = red
plot(factor(month),upper)
# this produces a box-and-whisker plot which is useful for error-checking
# R produces this when the operator 'plot' is followed by a categorical explanatory variable ('factor(month)')
# LOOKING FOR INTERACTIONS BETWEEN CONTINUOUS VARIABLES
# we now have a table with three columns: x, y and z
par(mfrow=c(1,2))
# this alters the graphics parameter to specify two sets of axis on the same row
plot(x,y)
plot(z,y)
# now these two graphs are side by side
windows(7,7)
# windows operator changes the shape of the plotting window
# this command is not needed for RStudio
coplot(y~x|z,pch=16,panel=panel.smooth)
# coplot plots y against x conditional of (or given) the value of z
# "|" is read "given"
# this splits the data into six graphs
# pch=16 is for solid black plotting symbols
# panel=panel.smooth is to fit a trend line through the scatterplot in red
# it also produces shaded horizontal bars (shingles) which overlap as they share data points
# INTERACTIONS INVOLVING CATEGORICAL VARIABLES
par(mfrow=c(1,2))
# alters the graphics parameter to specify two sets of axis on the same row
plot(nitrogen,yield,main="Nitrog")
plot(phosphorus,yield,main="Phosph")
# plots show the 'main effects' of nitrogen and phosphorous in separate graphs
tapply(yield,list(nitrogen,phosphorus),mean)
# 'tapply' summarises variables
# useful for breaking up a vector into groups defined by some classifying factor, compute a factor on the subsets and returning the result in a convenient form
barplot(tapply(yield,list(nitrogen,phosphorus),mean),beside = TRUE,xlab = "phosphorous")
# barplot is used to output the interactions calculated by tapply graphically
# xlab is similar to 'main', it adds labels to the plot
legend(locator(1),legend = c("no","yes"),title="nitrogen",fill=c("black","lightgrey"))
# 'locator' allows to put the legend in a place where it does no interfere with any of the bars
# click to add the legend
# APPENDIX, ESSENTIALS OF THE R LANGUAGE
# R AS A CALCULATOR
log(42/7.3)
log(16,2)
5+6+3+6+4+2+4+8+3+2+7
2+3; 5*7; 3-7
8^(1/3)
abs(5.7-6.8)/0.38
options(digits=3)
# controls the number of digits printed
# BUILT-IN FUNCTIONS
exp(1)
# antilog function of the base e
pi
sin(pi/2)
cos(pi/2)
# MODULO AND INTERGER QUOTIENTS
119%/%13
# integer
119%%13
# modulo: whats left over
# modulo can be used to test whether numbers are odd or even
15421%%7==0
# modulo can also be used to find out whether one number is an exact multiple of another
# ASSIGNMENT
x<-5
# ROUNDING
floor(5.7)
ceiling(5.7)
rounded<-function(x)floor(x+0.5)
rounded(5.7)
# INFINITY AND THINGS THAT ARE NOT A NUMBER
3/0
# Inf
-12/0
# -Inf
exp(-Inf)
# 0
0/Inf
# 0
(0:3)^Inf
# 0:3 generates a series (0,1,2,3)
# It produces a vector of answers 0 1 Inf Inf
0/0
# NaN (Not a Number)
is.finite(10)
# TRUE
is.infinite(10)
# FALSE
# MISSING VALUES (NA)
x<-c(1:8,NA)
mean(x)
# this will come up NA
# the NA needs to be removed
mean(x,na.rm=T)
# this removes NA
(vmv<-C(1:6,NA,NA,9:12))
# attribute values to 'vmv'
seq(along=vmv)[is.na(vmv)]
# 'is.na(x)' locates the missing values whithin a vector
which(is.na(vmv)
# same result as before
(vmv[is.na(vmv)]<-0)
# this attributes '0' to the NA values
vmv<-c(1:6,NA,NA,9:12)
ifelse(is.na(vmv),0,vmv)
# 'ifelse' does the same
# CREATING A VECTOR
y<-10:16
y<-(10,11,12,13,14,15,16)
y<-scan()
# 'scan' manual input through keyboard
(counts<-c(25,12,7,4,6,2,1,0,2))
names(counts)<-0:8
st<-table(rpois(2000,2.3))
as.vector(st)
# this removes the names
data<-read.csv("c=\\MSc\\Stats\\daphnia.csv")
attach(data)
names(data)
tapply(Growth.rate,Detergent,mean)
tapply(Growth.rate,list(Water,Daphnia),median)
x<-0:10
sum(x)
sum(x<5)
x<5
1*(x<5)
x*(x<5)
sum(x*(x<5))
y<-c(8,3,5,7,6,6,8,9,2,3,9,4,10,4,11)
sort(y)
# sorts in ascending sequence
rev(sort(y))
# descending
rev(sort(y))[2]
rev(sort(y))[1:3]
sum(rev(sort(y))[1:3])
y
which(y>5)
# shows the addresses of the elements with values higher than 5
y[y>5]
length(y)
length(y[y>5])
# the number of elements in total
x<-c(5,8,6,7,1,5,3)
z<-x[-1]
z
trim.mean<-function(x)mean(sort(x)[-c(1,length(x))])
# it first sorts the vector
# removes the first element using x[-1] and last using x[-length(x)]
# -c(1,length(x)) concatenates both instructions
trim.mean(x)
# REPEATS
rep(9,5)
# 99999
rep(1:4,2)
# 12341234
rep(1:4,each=2)
#11223344
rep(1:4,each=2,times=3)
#112233441122334411223344
rep(c(9,15,21,83),c(4,1,4,2))
# 999915212121218383
# GENERATE FACTOR LEVELS
gl(4,3)
# 'gl' (generate levels
# 'gl' (up to, with repeats of, to total length)
gl(4,3,24)
gl(4,3,20)
gl(3,2,24,labels=c("A","B","C"))
# AABBCCAABBCCAABBCCAABBCC
# Levels: ABC
seq(0,1.5,0.2)
# we use 'seq' when the interval is not 1.0
# here we want to go from 0 to 1.5 in steps of 0.2
x<-rnorm(18,10,2)
seq(88,50,along=x)
# 'along' can create a sequence of the same length as an existing vector 'x'
# MATRICES
X<-matrix(c(1,0,0,0,1,0,0,0,1),nrow=3)
# [,1] [,2] [,3]
# [1,] 1 0 0
# [2,] 0 1 0
# [3,] 0 0 1
class(X)
attributes(X)
vector<-c(1,2,3,4,4,3,2,1)
V<-matrix(vector,byrow=T,nrow=2)
# this allows to convert a vector into a matrix
V
# [,1] [,2] [,3] [,4]
# [1,] 1 2 3 4
# [2,] 4 3 2 1
dim(vector)<-c(4,2)
is.matrix(vector)
# this is another way to convert a vector into a matrix
vector
# [,1] [,2]
# [1,] 1 4
# [2,] 2 3
# [3,] 3 2
# [4,] 4 1
(vector<-t(vector))
# transpose matrix
# CHARACTER STRINGS
phrase<- "the quick brown fox jumps over the lazy cow"
q<-character(20)
for(i in 1:20)q[i]<-substr(phrase,1,i)
q
# [1] "t" "th"
# [3] "the" "the "
# [5] "the q" "the qu"
# [7] "the qui" "the quic"
# [9] "the quick" "the quick "
# [11] "the quick b" "the quick br"
# [13] "the quick bro" "the quick brow"
# [15] "the quick brown" "the quick brown "
# [17] "the quick brown f" "the quick brown fo"
# [19] "the quick brown fox" "the quick brown fox "
strsplit(phrase,split=character(0))
# [[1]]
# [1] "t" "h" "e" " " "q" "u" "i" "c" "k" " " "b" "r" "o"
# [14] "w" "n" " " "f" "o" "x" " " "j" "u" "m" "p" "s" " "
# [27] "o" "v" "e" "r" " " "t" "h" "e" " " "l" "a" "z" "y"
# [40] " " "c" "o" "w"
table(strsplit(phrase,split=character(0)))
# a b c e f h i j k l m n o p q r s t u v w x y z
# 8 1 1 2 3 1 2 1 1 1 1 1 1 4 1 1 2 1 2 2 1 2 1 1 1
words<-1+table(strsplit(phrase,split=character(0)))[1]
toupper(phrase)
tolower(toupper(phrase))
# [1] "THE QUICK BROWN FOX JUMPS OVER THE LAZY COW"
# toupper makes the characters upper case
# tolower makes them lower case
# ARITHMETIC MEAN OF A SINGLE SAMPLE
arithmetic.mean<-function(x) sum(x)/length(x)
y<-c(3,3,4,5,5)
arithmetic.mean(y)
mean(y)
sort(y)[ceiling(length(y)/2)]
med<-function(x){
odd.even<-length(x)%%2
if(odd.evem==0) (sort(x)[length(x)/2]+sort(x)[1+length(x)/2])/2
else sort(x)[ceiling(length(x)/2)]
}
med(y)
med(y[-1])
# LOOPS AND REPEATS
for(i in 1:5)print(i^2)
binary<-function(x)
if(x==0)return(0)
i<-0
string<-numeric(32)
while(x>0) {
string[32-i]<-x%%2
x<-x%/%2
i<-i+1 }
first<-match(1,string)
string[first:32]
sapply(15:17,binary)
# THE IFELSE FUNCTION
z<-ifelse(y<0, -1,1)
# EVALUATING FUNCTIONS WITH APPLY
(X<-matrix(1:24,nrow=4))
apply(X,1,sum)
# 'apply' is used for applying functions to the rows or columns of matrices
# the answer produced by 'apply is a vector rather than a matrix
apply(X,1,sqrt)
# TESTING FOR EQUALITY
x<-sqrt(2)
x*x==2
x*x-2
# TESTING AND COERCING IN R
as.numeric(factor(c("a","b","c")))
as.numeric(c("a","b","c"))
# non-numeric characters cannot be coerced into numbers
geometric<-function(x){
if(!is.numeric(x)) stop("Input must be numeric")
exp(mean(log(x)))}
geometric(c("a","b","c"))
geometric<-function(x){
if(!is.numeric(x)) stop("Input must be numeric")
if(min(x)<=0) stop ("Input must be greater than zero")
exp(mean(log(x)))}
geometric(c(2,3,0,4))
geometric(c(10,1000,10,1,1))
# TIME AND DATES
Sys.time()
class(Sys.time())
time.list<-as.POSIXlt(Sys.time())
class(time.list)
unlist(time.list)
tim.list$wday
time.list$yday
Rdate<-strptime(date,"%d/%m/%Y")
class(Rdate)
tapply(x,Rdate$wday,mean)
y<-paste(hrs,min,sec,sep=":")
strptime(y,"%T")
(Rtime<-as.difftime(y))
tapply(Rtime,experiment,mean)
y2<-as.POSIXlt("2018-10-22")
y1<-as.POSIXlt("2018-10-22")
y2-y1
y3<-as.POSIXlt("2018-10-22 09:30:59")
y4<-as.POSIXlt("2018-10-22 12:45:06")
y4-y3
difftime("2018-10-22 12:45:06","2018-10-22 09:30:59")
y4>y3
# UNDERSTANDIN THE STRUCTURE OF AN R OBJECT USING 'str'
x<-runif(23)
str(x)
basket<-list(rep("a",4),c("b0","b1","b2"),9:4,gl(5,3))
basket
str(basket)
xv<-seq(0,30)
yv<-2+0.5*xv+rnorm(31,0,2)
model<-lm(yv~xv+I(xv^2))
str(model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment