n8thangreen · December 15, 2016 09:48
diff --git a/R introduction - Lecture 2.R b/R introduction - Lecture 2.R
 ## ----setup, include=FALSE------------------------------------------------
 # Set the knitr chunk option `echo` to TRUE via knitr's chunk-option object.
 # NOTE(review): presumably this makes every chunk's source code appear in
 # the rendered document by default -- confirm against the knitr docs.
 knitr::opts_chunk$set(echo = TRUE)

 ## ---- eval=T,echo=T------------------------------------------------------
 # Four numeric scalars used by the comparison examples below.
 # Note that `c` is also the name of R's combine function; a call such as
 # c(1, 2) still works afterwards because R looks for a function when a
 # name is used in call position.
 a <- 1
 b <- 2
 c <- -1
 d <- 2

 # Equality test. `==` compares two values and yields TRUE/FALSE, whereas a
 # single `=` assigns. Here 1 == 2, so the result is FALSE.
 a == b

 # Strict "greater than". The logical result can be stored in a variable
 # and printed later; here 2 > -1, so result_1 is TRUE.
 result_1 <- d > c
 print(result_1)

 # "Less than or equal to": 2 <= 2 is TRUE.
 b <= d

 # "Not equal to": -1 != 2 is TRUE.
 c != b

 # The same operators also work on other simple data types, such as
 # character strings.
 test_1 <- "apples"
 test_2 <- "oranges"

 test_1 != test_2

 ## ---- echo=T,eval=T------------------------------------------------------
 # Build three parallel vectors describing the same nine characters.
 person_names <- c(
   "frodo", "samwise", "peregrin", "meriadoc", "aragorn",
   "gandalf", "legolas", "gimli", "boromir"
 )
 # `rep("hobbit", 4)` repeats the label four times instead of typing it out.
 races <- as.factor(
   c(rep("hobbit", 4), "man", "maia", "elf", "dwarf", "man")
 )
 ages <- c(50, 38, 28, 36, 87, 5000, 500, 139, 40)

 # Logical subsetting of a vector:
 # `indices` holds one TRUE/FALSE per element of `ages`, TRUE where the
 # age is greater than 50.
 indices <- ages > 50
 print(indices)

 # Indexing with a logical vector keeps only the positions that are TRUE,
 # i.e. the names of everyone older than 50.
 print(person_names[indices])

 # The comparison can also be written directly inside the brackets.
 print(ages[ages <= 50])
 print(person_names[races == "hobbit"])

 # `which()` turns the logical vector into the integer positions of the
 # elements that satisfy the condition (here: ages <= 50).
 print(which(ages <= 50))

 # Subsetting a data frame. `stringsAsFactors = FALSE` tells data.frame()
 # not to convert character columns (such as `name`) into factors.
 fellowship <- data.frame(
   name = person_names,
   race = races,
   age = ages,
   stringsAsFactors = FALSE
 )

 # The row selector has to refer to the column itself (`fellowship$race`):
 # "find the rows where the race column equals "hobbit" and keep only
 # those rows". Leaving the position after the comma empty keeps every
 # column.
 print(fellowship[fellowship$race == "hobbit", ])

 # The following line does NOT do what was intended: `"race" == "hobbit"`
 # compares two character strings and gives a single FALSE, so no rows are
 # selected and an empty (zero-row) data frame is printed.
 print(fellowship["race" == "hobbit", ])

 ## ---- eval=T,echo=T------------------------------------------------------
 # `&` is the elementwise logical AND: the result is TRUE only when the
 # conditions on both sides are TRUE.
 print((1 < 5) & (1 > 0))

 # Combined conditions work as a row filter too: keep the rows of
 # `fellowship` that are hobbits AND younger than 40 (all columns kept).
 young_hobbits <- fellowship[
   fellowship$race == "hobbit" & fellowship$age < 40,
 ]
 print(young_hobbits)

 ## ----echo=T,eval=T-------------------------------------------------------
 # Read the melanoma data (first line of the file holds the column names)
 # and show its first few rows.
 melanoma_data <- read.csv(file = "data/melanoma.csv", header = TRUE)
 print(head(melanoma_data))

 # Handing a whole data frame with more than two columns to plot() draws
 # every column against every other column by default.
 plot(melanoma_data)

 # Index the data frame to plot specific columns only. The row position
 # (before the comma) is left blank, so all rows are used, but only the
 # two named columns.
 plot(melanoma_data[, c("year", "incidence")])

 ## ---- echo=T,eval=T------------------------------------------------------
 # Same plot as before, now with optional arguments for the axis labels.
 # A function call may be spread over several lines: R keeps reading
 # until the opening parenthesis has been closed.
 plot(
   melanoma_data[, c("year", "incidence")],
   xlab = "Time in years",
   ylab = "Incidence"
 )

 ## ---- echo=T,eval=T------------------------------------------------------
 # Read the CD4 data and draw side-by-side boxplots comparing the
 # summaries of the two time points (baseline and one year).
 cd4_data <- read.csv(file = "data/cd4.csv", header = TRUE)
 boxplot(
   cd4_data[, c("baseline", "oneyear")],
   ylab = "CD4 cell counts (in 100's)"
 )

 # Histogram of the baseline column to look at how the data are
 # distributed.
 hist(
   cd4_data[, "baseline"],
   main = "Histogram of CD4 Cell count at Baseline",
   ylab = "Distribution of cell counts at baseline",
   xlab = "CD4 cell count (100's)"
 )


 ## ---- echo=T,eval=T------------------------------------------------------
 # Scatter plot of melanoma incidence (y axis) against year (x axis), with
 # a title and axis labels supplied through the extra arguments.
 plot(
   melanoma_data[["year"]], melanoma_data[["incidence"]],
   main = "Melanoma incidence over time",
   xlab = "Time in years",
   ylab = "Incidence"
 )
 # lowess() computes a smoothed curve for the relationship between year
 # and incidence; lines() draws that curve on top of the existing plot,
 # and `col` sets the line colour. Try calling lowess() on its own,
 # without lines(), to see what it returns.
 lines(
   lowess(melanoma_data[["year"]], melanoma_data[["incidence"]]),
   col = "red"
 )

 ## ---- echo=T,eval=T,error=TRUE,warning=TRUE------------------------------
 # An example of a warning message. NaN stands for "not a number", meaning
 # that the result doesn't make any mathematical sense. A warning does not
 # stop execution: R still returns a value (NaN) and carries on.
 log(-2)

 # A few examples of error messages - note that the messages are somewhat
 # informative. Unlike a warning, an error aborts a script at that line.
 # Each call is therefore wrapped in try(): the error message is still
 # reported, but the remaining examples also run when this file is executed
 # outside knitr (the `error=TRUE` chunk option above only applies when the
 # document is knitted).
 try(exp("candy"))  # a character string is not a number
 try(rnorm(NULL))   # NULL is not a valid number of draws
 try(a[1, 1])       # row/column indexing on something that is not 2-D
	## ----setup, include=FALSE------------------------------------------------
	# Set the knitr chunk option `echo` to TRUE via knitr's chunk-option object.
	# NOTE(review): presumably this makes every chunk's source code appear in
	# the rendered document by default -- confirm against the knitr docs.
	knitr::opts_chunk$set(echo = TRUE)

	## ---- eval=T,echo=T------------------------------------------------------
	# Four numeric scalars used by the comparison examples below.
	# Note that `c` is also the name of R's combine function; a call such as
	# c(1, 2) still works afterwards because R looks for a function when a
	# name is used in call position.
	a <- 1
	b <- 2
	c <- -1
	d <- 2

	# Equality test. `==` compares two values and yields TRUE/FALSE, whereas a
	# single `=` assigns. Here 1 == 2, so the result is FALSE.
	a == b

	# Strict "greater than". The logical result can be stored in a variable
	# and printed later; here 2 > -1, so result_1 is TRUE.
	result_1 <- d > c
	print(result_1)

	# "Less than or equal to": 2 <= 2 is TRUE.
	b <= d

	# "Not equal to": -1 != 2 is TRUE.
	c != b

	# The same operators also work on other simple data types, such as
	# character strings.
	test_1 <- "apples"
	test_2 <- "oranges"

	test_1 != test_2

	## ---- echo=T,eval=T------------------------------------------------------
	# Build three parallel vectors describing the same nine characters.
	person_names <- c(
	  "frodo", "samwise", "peregrin", "meriadoc", "aragorn",
	  "gandalf", "legolas", "gimli", "boromir"
	)
	# `rep("hobbit", 4)` repeats the label four times instead of typing it out.
	races <- as.factor(
	  c(rep("hobbit", 4), "man", "maia", "elf", "dwarf", "man")
	)
	ages <- c(50, 38, 28, 36, 87, 5000, 500, 139, 40)

	# Logical subsetting of a vector:
	# `indices` holds one TRUE/FALSE per element of `ages`, TRUE where the
	# age is greater than 50.
	indices <- ages > 50
	print(indices)

	# Indexing with a logical vector keeps only the positions that are TRUE,
	# i.e. the names of everyone older than 50.
	print(person_names[indices])

	# The comparison can also be written directly inside the brackets.
	print(ages[ages <= 50])
	print(person_names[races == "hobbit"])

	# `which()` turns the logical vector into the integer positions of the
	# elements that satisfy the condition (here: ages <= 50).
	print(which(ages <= 50))

	# Subsetting a data frame. `stringsAsFactors = FALSE` tells data.frame()
	# not to convert character columns (such as `name`) into factors.
	fellowship <- data.frame(
	  name = person_names,
	  race = races,
	  age = ages,
	  stringsAsFactors = FALSE
	)

	# The row selector has to refer to the column itself (`fellowship$race`):
	# "find the rows where the race column equals "hobbit" and keep only
	# those rows". Leaving the position after the comma empty keeps every
	# column.
	print(fellowship[fellowship$race == "hobbit", ])

	# The following line does NOT do what was intended: `"race" == "hobbit"`
	# compares two character strings and gives a single FALSE, so no rows are
	# selected and an empty (zero-row) data frame is printed.
	print(fellowship["race" == "hobbit", ])

	## ---- eval=T,echo=T------------------------------------------------------
	# `&` is the elementwise logical AND: the result is TRUE only when the
	# conditions on both sides are TRUE.
	print((1 < 5) & (1 > 0))

	# Combined conditions work as a row filter too: keep the rows of
	# `fellowship` that are hobbits AND younger than 40 (all columns kept).
	young_hobbits <- fellowship[
	  fellowship$race == "hobbit" & fellowship$age < 40,
	]
	print(young_hobbits)

	## ----echo=T,eval=T-------------------------------------------------------
	# Read the melanoma data (first line of the file holds the column names)
	# and show its first few rows.
	melanoma_data <- read.csv(file = "data/melanoma.csv", header = TRUE)
	print(head(melanoma_data))

	# Handing a whole data frame with more than two columns to plot() draws
	# every column against every other column by default.
	plot(melanoma_data)

	# Index the data frame to plot specific columns only. The row position
	# (before the comma) is left blank, so all rows are used, but only the
	# two named columns.
	plot(melanoma_data[, c("year", "incidence")])

	## ---- echo=T,eval=T------------------------------------------------------
	# Same plot as before, now with optional arguments for the axis labels.
	# A function call may be spread over several lines: R keeps reading
	# until the opening parenthesis has been closed.
	plot(
	  melanoma_data[, c("year", "incidence")],
	  xlab = "Time in years",
	  ylab = "Incidence"
	)

	## ---- echo=T,eval=T------------------------------------------------------
	# Read the CD4 data and draw side-by-side boxplots comparing the
	# summaries of the two time points (baseline and one year).
	cd4_data <- read.csv(file = "data/cd4.csv", header = TRUE)
	boxplot(
	  cd4_data[, c("baseline", "oneyear")],
	  ylab = "CD4 cell counts (in 100's)"
	)

	# Histogram of the baseline column to look at how the data are
	# distributed.
	hist(
	  cd4_data[, "baseline"],
	  main = "Histogram of CD4 Cell count at Baseline",
	  ylab = "Distribution of cell counts at baseline",
	  xlab = "CD4 cell count (100's)"
	)


	## ---- echo=T,eval=T------------------------------------------------------
	# Scatter plot of melanoma incidence (y axis) against year (x axis), with
	# a title and axis labels supplied through the extra arguments.
	plot(
	  melanoma_data[["year"]], melanoma_data[["incidence"]],
	  main = "Melanoma incidence over time",
	  xlab = "Time in years",
	  ylab = "Incidence"
	)
	# lowess() computes a smoothed curve for the relationship between year
	# and incidence; lines() draws that curve on top of the existing plot,
	# and `col` sets the line colour. Try calling lowess() on its own,
	# without lines(), to see what it returns.
	lines(
	  lowess(melanoma_data[["year"]], melanoma_data[["incidence"]]),
	  col = "red"
	)

	## ---- echo=T,eval=T,error=TRUE,warning=TRUE------------------------------
	# An example of a warning message. NaN stands for "not a number", meaning
	# that the result doesn't make any mathematical sense. A warning does not
	# stop execution: R still returns a value (NaN) and carries on.
	log(-2)

	# A few examples of error messages - note that the messages are somewhat
	# informative. Unlike a warning, an error aborts a script at that line.
	# Each call is therefore wrapped in try(): the error message is still
	# reported, but the remaining examples also run when this file is executed
	# outside knitr (the `error=TRUE` chunk option above only applies when the
	# document is knitted).
	try(exp("candy"))  # a character string is not a number
	try(rnorm(NULL))   # NULL is not a valid number of draws
	try(a[1, 1])       # row/column indexing on something that is not 2-D