Last active
June 15, 2016 15:14
-
-
Save nickpettican/7ef9598326b22e75b940 to your computer and use it in GitHub Desktop.
Stats01
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CHAPTER 1 | |
# FUNDAMENTALS | |
# The response variable is the things you are working on, the variable whose variation you are attempting to understand. | |
# This is the variable that goes in the "y" axis of the graph (the ordinate). | |
# The explanatory variable goes on the "x" axis of the graph (the abscissa). | |
# The explanatory variables: | |
# - Regression: all explanatory variables continuous. | |
# - ANOVA: all explanatory variables categorical. | |
# - ANCOVA: some explanatory variables continuous some categorical. | |
# The response variable: | |
# - Regression, ANOVA or ANCOVA: continuous. | |
# - Logistic regression: proportion. | |
# - Log linear models: count. | |
# - Binary logistic analysis: binary. | |
# - Survival analysis: time at death. | |
# EVERYTHING VARIES: | |
# Heterogeneity is universal, so we need a way of discriminating between variation that is scientifically interesting | |
# and variation that just reflects background heterogeneity. | |
worms <- read.csv("C:\\MSc\\Statistics\\worms.csv") | |
# opens the file worms.csv | |
attach(worms) | |
# takes a "database" | |
names(worms) | |
# prints the name of the columns | |
# subscripts in R appear within scare brackets, thus y[7] is the 7th element of the vector "y" | |
# z[2,6] it is the second row of the 6th column. | |
# [, means all the rows | |
# ,] means all the columns | |
worms[,1:3] | |
# to view the first, second and third column. | |
worms[5:15,] | |
# to view the middle 11 rows of all the columns of the dataframe | |
worms[Area>3&Slope<3,] | |
# this selects onyl the rows which have Area > 3 and Slope <3 using '] | |
worms[order(Area),] | |
# this sorts the variable "Area" in ascending order | |
worms[rev(order(worms[,5])),c(5,7)] | |
# this sorts the 5th and 7th column into descending order | |
# SUMMARISING THE CONTENTS OF A DATAFRAME | |
summary(worms) | |
with(worms,tapply(Worm.density,Vegetation,mean)) | |
# to find out the mean number under different variables | |
# for a single response variable you can use "tapply" and "with" | |
aggregate(worms[,c(2,3,5,7)], list(Vegetation),mean) | |
# "aggregate" calculates the mean values of all the continuous variables summarised at the same time | |
# "aggregate" assigns the word "Group.1" to the "Vegetation" column as a default | |
aggregate(worms[,c(2,3,5,7)], list(Communit=Vegetation),mean) | |
# this assigns the word "Community to the column instead | |
aggregate(worms[,c(2,3,5,7)], list(Moisture=Damp,Community=Vegetation),mean) | |
# you can do multiple classifications using two or more categorical explanatory variables | |
with(worms,tapply(Slope,list(Damp,Vegetation),mean)) | |
# tapply operator produces NA for missing combinations | |
# GET TO KNOW YOUR DATA | |
head(data) | |
# prints the first 6 lines of the column | |
plot(y) | |
# produces a scatterplot with the values of "y" | |
which(y>10) | |
# prints the number of the line that has a value of >10 | |
y[50]<-2.179386 | |
# assigns the value 2.179386 to y[50] | |
table(treatment) | |
# this checks the factor levels of the variable "treatment" | |
# in this case we found a mistake in one of the entries | |
which(treatment=="nitogen") | |
# this finds the mistake: in this case "nitogen" | |
treatment[11]<-"nitrogen" | |
# this fixes the mistake, attributing the word "nitrogen" to the entry. | |
# Note that words need to be in brackets "" | |
# to save the file | |
write(x, file = "data", | |
ncolumns = if(is.character(x)) 1 else 5, | |
append = FALSE, sep = " ") | |
# it should work just by typing: | |
write.csv(name_of_the_file, file = "new_name.csv") | |
# when saving files it is worth setting a directory to save the files | |
getwd() | |
# this prints the current working directory | |
setwd("C:\\MSc\\Statistics\\Satistics") | |
# this sets the current directory | |
# note that "C:\\" must be in capitols | |
detach(data) | |
# this stops R from using the file "data" again | |
rm(data) | |
# this operator removes "data" from R | |
# RELATIONSHIPS | |
plot(x,y,pch=21,bg="red") | |
# the response variable is "y" and the explanatory variable is "x" | |
# this plots a graph with red spots as the values in the entries | |
# pch stands for plotting character, pch=21 gives a black edge and bright colour fill | |
# bg is background = red | |
plot(factor(month),upper) | |
# this produces a box-and-whisker plot which is useful for error-checking | |
# R produces this when the operator 'plot' is followed by a categorical explanatory variable ('factor(month)') | |
# LOOKING FOR INTERACTIONS BETWEEN CONTINUOUS VARIABLES | |
# we now have a table with three columns: x, y and z | |
par(mfrow=c(1,2)) | |
# this alters the graphics parameter to specify two sets of axis on the same row | |
plot(x,y) | |
plot(z,y) | |
# now these two graphs are side by side | |
windows(7,7) | |
# windows operator changes the shape of the plotting window | |
# this command is not needed for RStudio | |
coplot(y~x|z,pch=16,panel=panel.smooth) | |
# coplot plots y against x conditional of (or given) the value of z | |
# "|" is read "given" | |
# this splits the data into six graphs | |
# pch=16 is for solid black plotting symbols | |
# panel=panel.smooth is to fit a trend line through the scatterplot in red | |
# it also produces shaded horizontal bars (shingles) which overlap as they share data points | |
# INTERACTIONS INVOLVING CATEGORICAL VARIABLES | |
par(mfrow=c(1,2)) | |
# alters the graphics parameter to specify two sets of axis on the same row | |
plot(nitrogen,yield,main="Nitrog") | |
plot(phosphorus,yield,main="Phosph") | |
# plots show the 'main effects' of nitrogen and phosphorous in separate graphs | |
tapply(yield,list(nitrogen,phosphorus),mean) | |
# 'tapply' summarises variables | |
# useful for breaking up a vector into groups defined by some classifying factor, compute a factor on the subsets and returning the result in a convenient form | |
barplot(tapply(yield,list(nitrogen,phosphorus),mean),beside = TRUE,xlab = "phosphorous") | |
# barplot is used to output the interactions calculated by tapply graphically | |
# xlab is similar to 'main', it adds labels to the plot | |
legend(locator(1),legend = c("no","yes"),title="nitrogen",fill=c("black","lightgrey")) | |
# 'locator' allows to put the legend in a place where it does no interfere with any of the bars | |
# click to add the legend | |
# APPENDIX, ESSENTIALS OF THE R LANGUAGE | |
# R AS A CALCULATOR | |
log(42/7.3) | |
log(16,2) | |
5+6+3+6+4+2+4+8+3+2+7 | |
2+3; 5*7; 3-7 | |
8^(1/3) | |
abs(5.7-6.8)/0.38 | |
options(digits=3) | |
# controls the number of digits printed | |
# BUILT-IN FUNCTIONS | |
exp(1) | |
# antilog function of the base e | |
pi | |
sin(pi/2) | |
cos(pi/2) | |
# MODULO AND INTERGER QUOTIENTS | |
119%/%13 | |
# integer | |
119%%13 | |
# modulo: whats left over | |
# modulo can be used to test whether numbers are odd or even | |
15421%%7==0 | |
# modulo can also be used to find out whether one number is an exact multiple of another | |
# ASSIGNMENT | |
x<-5 | |
# ROUNDING | |
floor(5.7) | |
ceiling(5.7) | |
rounded<-function(x)floor(x+0.5) | |
rounded(5.7) | |
# INFINITY AND THINGS THAT ARE NOT A NUMBER | |
3/0 | |
# Inf | |
-12/0 | |
# -Inf | |
exp(-Inf) | |
# 0 | |
0/Inf | |
# 0 | |
(0:3)^Inf | |
# 0:3 generates a series (0,1,2,3) | |
# It produces a vector of answers 0 1 Inf Inf | |
0/0 | |
# NaN (Not a Number) | |
is.finite(10) | |
# TRUE | |
is.infinite(10) | |
# FALSE | |
# MISSING VALUES (NA) | |
x<-c(1:8,NA) | |
mean(x) | |
# this will come up NA | |
# the NA needs to be removed | |
mean(x,na.rm=T) | |
# this removes NA | |
(vmv<-C(1:6,NA,NA,9:12)) | |
# attribute values to 'vmv' | |
seq(along=vmv)[is.na(vmv)] | |
# 'is.na(x)' locates the missing values whithin a vector | |
which(is.na(vmv) | |
# same result as before | |
(vmv[is.na(vmv)]<-0) | |
# this attributes '0' to the NA values | |
vmv<-c(1:6,NA,NA,9:12) | |
ifelse(is.na(vmv),0,vmv) | |
# 'ifelse' does the same | |
# CREATING A VECTOR | |
y<-10:16 | |
y<-(10,11,12,13,14,15,16) | |
y<-scan() | |
# 'scan' manual input through keyboard | |
(counts<-c(25,12,7,4,6,2,1,0,2)) | |
names(counts)<-0:8 | |
st<-table(rpois(2000,2.3)) | |
as.vector(st) | |
# this removes the names | |
data<-read.csv("c=\\MSc\\Stats\\daphnia.csv") | |
attach(data) | |
names(data) | |
tapply(Growth.rate,Detergent,mean) | |
tapply(Growth.rate,list(Water,Daphnia),median) | |
x<-0:10 | |
sum(x) | |
sum(x<5) | |
x<5 | |
1*(x<5) | |
x*(x<5) | |
sum(x*(x<5)) | |
y<-c(8,3,5,7,6,6,8,9,2,3,9,4,10,4,11) | |
sort(y) | |
# sorts in ascending sequence | |
rev(sort(y)) | |
# descending | |
rev(sort(y))[2] | |
rev(sort(y))[1:3] | |
sum(rev(sort(y))[1:3]) | |
y | |
which(y>5) | |
# shows the addresses of the elements with values higher than 5 | |
y[y>5] | |
length(y) | |
length(y[y>5]) | |
# the number of elements in total | |
x<-c(5,8,6,7,1,5,3) | |
z<-x[-1] | |
z | |
trim.mean<-function(x)mean(sort(x)[-c(1,length(x))]) | |
# it first sorts the vector | |
# removes the first element using x[-1] and last using x[-length(x)] | |
# -c(1,length(x)) concatenates both instructions | |
trim.mean(x) | |
# REPEATS | |
rep(9,5) | |
# 99999 | |
rep(1:4,2) | |
# 12341234 | |
rep(1:4,each=2) | |
#11223344 | |
rep(1:4,each=2,times=3) | |
#112233441122334411223344 | |
rep(c(9,15,21,83),c(4,1,4,2)) | |
# 999915212121218383 | |
# GENERATE FACTOR LEVELS | |
gl(4,3) | |
# 'gl' (generate levels | |
# 'gl' (up to, with repeats of, to total length) | |
gl(4,3,24) | |
gl(4,3,20) | |
gl(3,2,24,labels=c("A","B","C")) | |
# AABBCCAABBCCAABBCCAABBCC | |
# Levels: ABC | |
seq(0,1.5,0.2) | |
# we use 'seq' when the interval is not 1.0 | |
# here we want to go from 0 to 1.5 in steps of 0.2 | |
x<-rnorm(18,10,2) | |
seq(88,50,along=x) | |
# 'along' can create a sequence of the same length as an existing vector 'x' | |
# MATRICES | |
X<-matrix(c(1,0,0,0,1,0,0,0,1),nrow=3) | |
# [,1] [,2] [,3] | |
# [1,] 1 0 0 | |
# [2,] 0 1 0 | |
# [3,] 0 0 1 | |
class(X) | |
attributes(X) | |
vector<-c(1,2,3,4,4,3,2,1) | |
V<-matrix(vector,byrow=T,nrow=2) | |
# this allows to convert a vector into a matrix | |
V | |
# [,1] [,2] [,3] [,4] | |
# [1,] 1 2 3 4 | |
# [2,] 4 3 2 1 | |
dim(vector)<-c(4,2) | |
is.matrix(vector) | |
# this is another way to convert a vector into a matrix | |
vector | |
# [,1] [,2] | |
# [1,] 1 4 | |
# [2,] 2 3 | |
# [3,] 3 2 | |
# [4,] 4 1 | |
(vector<-t(vector)) | |
# transpose matrix | |
# CHARACTER STRINGS | |
phrase<- "the quick brown fox jumps over the lazy cow" | |
q<-character(20) | |
for(i in 1:20)q[i]<-substr(phrase,1,i) | |
q | |
# [1] "t" "th" | |
# [3] "the" "the " | |
# [5] "the q" "the qu" | |
# [7] "the qui" "the quic" | |
# [9] "the quick" "the quick " | |
# [11] "the quick b" "the quick br" | |
# [13] "the quick bro" "the quick brow" | |
# [15] "the quick brown" "the quick brown " | |
# [17] "the quick brown f" "the quick brown fo" | |
# [19] "the quick brown fox" "the quick brown fox " | |
strsplit(phrase,split=character(0)) | |
# [[1]] | |
# [1] "t" "h" "e" " " "q" "u" "i" "c" "k" " " "b" "r" "o" | |
# [14] "w" "n" " " "f" "o" "x" " " "j" "u" "m" "p" "s" " " | |
# [27] "o" "v" "e" "r" " " "t" "h" "e" " " "l" "a" "z" "y" | |
# [40] " " "c" "o" "w" | |
table(strsplit(phrase,split=character(0))) | |
# a b c e f h i j k l m n o p q r s t u v w x y z | |
# 8 1 1 2 3 1 2 1 1 1 1 1 1 4 1 1 2 1 2 2 1 2 1 1 1 | |
words<-1+table(strsplit(phrase,split=character(0)))[1] | |
toupper(phrase) | |
tolower(toupper(phrase)) | |
# [1] "THE QUICK BROWN FOX JUMPS OVER THE LAZY COW" | |
# toupper makes the characters upper case | |
# tolower makes them lower case | |
# ARITHMETIC MEAN OF A SINGLE SAMPLE | |
arithmetic.mean<-function(x) sum(x)/length(x) | |
y<-c(3,3,4,5,5) | |
arithmetic.mean(y) | |
mean(y) | |
sort(y)[ceiling(length(y)/2)] | |
med<-function(x){ | |
odd.even<-length(x)%%2 | |
if(odd.evem==0) (sort(x)[length(x)/2]+sort(x)[1+length(x)/2])/2 | |
else sort(x)[ceiling(length(x)/2)] | |
} | |
med(y) | |
med(y[-1]) | |
# LOOPS AND REPEATS | |
for(i in 1:5)print(i^2) | |
binary<-function(x) | |
if(x==0)return(0) | |
i<-0 | |
string<-numeric(32) | |
while(x>0) { | |
string[32-i]<-x%%2 | |
x<-x%/%2 | |
i<-i+1 } | |
first<-match(1,string) | |
string[first:32] | |
sapply(15:17,binary) | |
# THE IFELSE FUNCTION | |
z<-ifelse(y<0, -1,1) | |
# EVALUATING FUNCTIONS WITH APPLY | |
(X<-matrix(1:24,nrow=4)) | |
apply(X,1,sum) | |
# 'apply' is used for applying functions to the rows or columns of matrices | |
# the answer produced by 'apply is a vector rather than a matrix | |
apply(X,1,sqrt) | |
# TESTING FOR EQUALITY | |
x<-sqrt(2) | |
x*x==2 | |
x*x-2 | |
# TESTING AND COERCING IN R | |
as.numeric(factor(c("a","b","c"))) | |
as.numeric(c("a","b","c")) | |
# non-numeric characters cannot be coerced into numbers | |
geometric<-function(x){ | |
if(!is.numeric(x)) stop("Input must be numeric") | |
exp(mean(log(x)))} | |
geometric(c("a","b","c")) | |
geometric<-function(x){ | |
if(!is.numeric(x)) stop("Input must be numeric") | |
if(min(x)<=0) stop ("Input must be greater than zero") | |
exp(mean(log(x)))} | |
geometric(c(2,3,0,4)) | |
geometric(c(10,1000,10,1,1)) | |
# TIME AND DATES | |
Sys.time() | |
class(Sys.time()) | |
time.list<-as.POSIXlt(Sys.time()) | |
class(time.list) | |
unlist(time.list) | |
tim.list$wday | |
time.list$yday | |
Rdate<-strptime(date,"%d/%m/%Y") | |
class(Rdate) | |
tapply(x,Rdate$wday,mean) | |
y<-paste(hrs,min,sec,sep=":") | |
strptime(y,"%T") | |
(Rtime<-as.difftime(y)) | |
tapply(Rtime,experiment,mean) | |
y2<-as.POSIXlt("2018-10-22") | |
y1<-as.POSIXlt("2018-10-22") | |
y2-y1 | |
y3<-as.POSIXlt("2018-10-22 09:30:59") | |
y4<-as.POSIXlt("2018-10-22 12:45:06") | |
y4-y3 | |
difftime("2018-10-22 12:45:06","2018-10-22 09:30:59") | |
y4>y3 | |
# UNDERSTANDIN THE STRUCTURE OF AN R OBJECT USING 'str' | |
x<-runif(23) | |
str(x) | |
basket<-list(rep("a",4),c("b0","b1","b2"),9:4,gl(5,3)) | |
basket | |
str(basket) | |
xv<-seq(0,30) | |
yv<-2+0.5*xv+rnorm(31,0,2) | |
model<-lm(yv~xv+I(xv^2)) | |
str(model) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment