Last active
December 15, 2016 09:48
-
-
Save n8thangreen/3c1f746867390f2dd561951751988113 to your computer and use it in GitHub Desktop.
Imperial College London MSc Public Health practical
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ----setup, include=FALSE------------------------------------------------
# Set the knitr default so that every chunk's source code is echoed in the
# rendered document.
knitr::opts_chunk$set(echo = TRUE)
## ---- eval=T,echo=T------------------------------------------------------
# Four scalar values used to demonstrate the relational operators below.
# Note: the variable `c` has the same name as the base function c(). Calls
# such as c(1, 2) still work, because R skips non-function objects when it
# looks up a name that is being called as a function.
a <- 1
b <- 2
c <- -1
d <- 2

# Check if two values are the same. Note the difference between
# `==` (comparison) and a single `=` (assignment).
a == b

# Check if a value is greater than another.
# Note that you can store the result in a variable.
result_1 <- d > c
print(result_1)

# Check if a value is less than or equal to another.
b <= d

# Check if a value does not equal another.
c != b

# This also works with other simple data types, including character strings.
test_1 <- "apples"
test_2 <- "oranges"
test_1 != test_2
## ---- echo=T,eval=T------------------------------------------------------
# Create some fictional data: three parallel vectors, one element per person.
person_names <- c(
  "frodo", "samwise", "peregrin", "meriadoc", "aragorn",
  "gandalf", "legolas", "gimli", "boromir"
)
# Note the use of `rep` to repeat "hobbit" 4 times.
races <- as.factor(c(rep("hobbit", 4), "man", "maia", "elf", "dwarf", "man"))
ages <- c(50, 38, 28, 36, 87, 5000, 500, 139, 40)

# Logical subsetting of a vector:
# Create a new logical vector which indicates, for each individual
# observation, whether or not the age is greater than 50.
indices <- ages > 50
print(indices)

# This prints the elements of `person_names` for which `indices` is TRUE,
# i.e. the names of everyone whose age is > 50.
print(person_names[indices])

# Or do this directly, without storing the logical vector first.
print(ages[ages <= 50])
print(person_names[races == "hobbit"])

# You can also find the actual indices using the `which()` function.
# This returns the positions in `ages` that satisfy the condition
# age <= 50.
print(which(ages <= 50))

# Let's try subsetting a data frame. Note the argument to not automatically
# convert strings to factors when declaring the data frame. `races` was
# already converted with as.factor() above, so `race` remains a factor;
# the argument only keeps `name` as a character column.
fellowship <- data.frame(
  name = person_names,
  race = races,
  age = ages,
  stringsAsFactors = FALSE
)

# Note that we have to subset the correct column of the data frame with
# `fellowship$race`. This is essentially saying "find all elements of the
# fellowship$race vector that equal "hobbit", and use these to print
# only the corresponding rows of the fellowship data frame".
# Note the use of the comma to indicate that we want all columns.
print(fellowship[fellowship$race == "hobbit", ])

# The following line does NOT do what we want. It runs without an error, but
# `"race" == "hobbit"` compares two literal strings and is simply FALSE, so
# no rows are selected and an empty data frame is printed.
print(fellowship["race" == "hobbit", ])
## ---- eval=T,echo=T------------------------------------------------------
# `&` is the elementwise logical AND: the result is TRUE only where both
# sides are TRUE.
print(1 < 5 & 1 > 0)

# You can use this to subset data as well: keep the rows of `fellowship`
# (built in the previous chunk) whose race is "hobbit" AND whose age is
# below 40. The trailing comma keeps all columns.
is_young_hobbit <- fellowship$race == "hobbit" & fellowship$age < 40
young_hobbits <- fellowship[is_young_hobbit, ]
print(young_hobbits)
## ----echo=T,eval=T-------------------------------------------------------
# Read the melanoma data set; the first line of the file holds the
# column names.
melanoma_data <- read.csv(file = "data/melanoma.csv", header = TRUE)
print(head(melanoma_data))

# Notice that by providing a data frame with more than two columns,
# the plot function plots each column against every other column
# by default (a scatter-plot matrix).
plot(melanoma_data)

# Plotting only specific columns is done by indexing.
# Note the row index is blank here, so all rows are used,
# but only the two specified columns.
plot(melanoma_data[, c("year", "incidence")])
## ---- echo=T,eval=T------------------------------------------------------
# Plot the same data but with a number of optional arguments.
# Notice that the function call can be spread over multiple lines: R keeps
# reading until the opening parenthesis has been closed, so each argument
# can sit on its own line.
plot(
  melanoma_data[, c("year", "incidence")],
  xlab = "Time in years",
  ylab = "Incidence"
)
## ---- echo=T,eval=T------------------------------------------------------
# Read the CD4 data set; the first line of the file holds the column names.
cd4_data <- read.csv(file = "data/cd4.csv", header = TRUE)

# Create a boxplot comparing data summaries between two time points:
# one box per selected column ("baseline" and "oneyear").
boxplot(
  cd4_data[, c("baseline", "oneyear")],
  ylab = "CD4 cell counts (in 100's)"
)

# Create a histogram to look at the distribution of the baseline values.
# `main` sets the plot title; `xlab` and `ylab` set the axis labels.
hist(
  cd4_data[, "baseline"],
  main = "Histogram of CD4 Cell count at Baseline",
  ylab = "Distribution of cell counts at baseline",
  xlab = "CD4 cell count (100's)"
)
## ---- echo=T,eval=T------------------------------------------------------
# Scatter plot of melanoma incidence (y) against year (x). Note the
# additional arguments for the plot title and axis labels.
plot(
  melanoma_data$year, melanoma_data$incidence,
  main = "Melanoma incidence over time",
  xlab = "Time in years",
  ylab = "Incidence"
)

# We use the `lines` function to add a line to the existing plot. The
# `lowess` function produces a smoothing line for the relationship between
# year and incidence, which we then add to the plot. Note that we have also
# used the `col` argument to change the line colour. Have a look at calling
# the lowess function without the lines function to see what it returns.
lines(
  lowess(melanoma_data$year, melanoma_data$incidence),
  col = "red"
)
## ---- echo=T,eval=T,error=TRUE,warning=TRUE------------------------------
# The chunk options above (error=TRUE, warning=TRUE) ask knitr to show the
# messages in the rendered document. The error examples below are
# intentional.

# An example of a warning message. Note that NaN stands for "not a number",
# meaning that the result doesn't make any mathematical sense.
log(-2)

# A few examples of error messages - note that the messages are somewhat
# informative.
exp("candy")   # a character string passed to a mathematical function
rnorm(NULL)    # NULL is not a valid number of observations
a[1, 1]        # `a` (from the first chunk) is a plain vector with no dimensions
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment