Created
August 14, 2013 12:05
-
-
Save rpietro/6230463 to your computer and use it in GitHub Desktop.
script on data management from a wide variety of sources including dalgaard's book and others
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# reading data into R: http://goo.gl/bAUmj | |
#install.packages("ISwR") | |
library("ISwR") | |
#factors | |
pain <- c(0,3,2,2,1) | |
fpain <- factor(pain,levels=0:3) | |
levels(fpain) <- c("none","mild","medium","severe") | |
fpain | |
# lists | |
intake.pre <- c(5260,5470,5640,6180,6390,6515,6805,7515,7515,8230,8770) | |
intake.post <- c(3910,4220,3885,5160,5645,4680,5265,5975,6790,6900,7335) | |
mylist <- list(before=intake.pre,after=intake.post) | |
mylist | |
mylist$before | |
# data frames | |
d <- data.frame(intake.pre,intake.post) | |
d | |
d$intake.pre | |
# indexing | |
intake.pre[5] | |
intake.pre[c(3,5,7)] | |
v <- c(3,5,7) | |
intake.pre[v] | |
intake.pre[1:5] | |
intake.pre[-c(3,5,7)] | |
intake.post[intake.pre > 7000] | |
intake.post[intake.pre > 7000 & intake.pre <= 8000] | |
intake.pre > 7000 & intake.pre <= 8000 | |
# indexing and dataframes | |
d <- data.frame(intake.pre,intake.post) | |
d[5,1] | |
d[5,] | |
d[d$intake.pre>7000,] | |
sel <- d$intake.pre>7000 | |
sel | |
d[sel,] | |
d[1:2,] | |
head(d) | |
#grouped data and dataframes | |
energy | |
exp.lean <- energy$expend[energy$stature=="lean"] | |
exp.obese <- energy$expend[energy$stature=="obese"] | |
l <- split(energy$expend, energy$stature) | |
l | |
# recoding | |
age <- subset(juul, age >= 10 & age <= 16)$age | |
range(age) | |
agegr <- cut(age, seq(10,16,2), right=F, include.lowest=T) | |
agegr | |
length(age) | |
table(agegr) | |
q <- quantile(age, c(0, .25, .50, .75, 1)) | |
q | |
ageQ <- cut(age, q, include.lowest=T) | |
table(ageQ) | |
levels(ageQ) <- c("1st", "2nd", "3rd", "4th") | |
levels(agegr) <- c("10-11", "12-13", "14-15") | |
table(ageQ) | |
table(agegr) | |
# manipulating factor levels | |
pain <- c(0,3,2,2,1) | |
fpain <- factor(pain,levels=0:3, | |
labels=c("none","mild","medium","severe")) | |
text.pain <- c("none","severe", "medium", "medium", "mild") | |
factor(text.pain) | |
ftpain <- factor(text.pain) | |
ftpain | |
ftpain2 <- factor(ftpain, levels=c("none", "mild", "medium", "severe")) | |
ftpain2 | |
# dates | |
stroke <- read.csv2(system.file("rawdata","stroke.csv", package="ISwR"), na.strings=".") | |
names(stroke) | |
names(stroke) <- tolower(names(stroke)) | |
names(stroke) | |
head(stroke) | |
stroke <- transform(stroke, | |
died = as.Date(died, format="%d.%m.%Y"), | |
dstr = as.Date(dstr, format="%d.%m.%Y")) | |
summary(stroke$died) | |
summary(stroke$dstr) | |
summary(stroke$died - stroke$dstr) | |
head(stroke$died - stroke$dstr) | |
stroke <- transform(stroke, | |
end = pmin(died, as.Date("1996-1-1"), na.rm = T), | |
dead = !is.na(died) & died < as.Date("1996-1-1")) | |
head(stroke) | |
stroke <- transform(stroke, | |
obstime = as.numeric(end - dstr, units="days")/365.25) | |
head(stroke) | |
# data merge and data frame restructuring | |
juulgrl <- subset(juul, sex==2, select=-c(testvol,sex)) | |
head(juulgrl) | |
juulboy <- subset(juul, sex==1, select=-c(menarche,sex)) | |
head(juulboy) | |
juulgrl$sex <- factor("F") | |
juulgrl$testvol <- NA | |
juulboy$sex <- factor("M") | |
juulboy$menarche <- NA | |
juulall <- rbind(juulboy, juulgrl) | |
names(juulall) | |
levels(juulall$sex) | |
#sqldf | |
# Load the package | |
# install.packages("sqldf") | |
# install.packages("PASWR") | |
library(sqldf) | |
# Use the titanic data set | |
data(titanic3, package="PASWR") | |
colnames(titanic3) | |
head(titanic3) | |
sqldf('select age, count(*) from titanic3 where age is not null group by age') | |
library(ggplot2) | |
DF=sqldf('select age from titanic3 where age != "NA"') | |
qplot(DF$age,data=DF, geom="histogram") | |
DF=sqldf('select count(*) total from titanic3 where age=29 group by survived') | |
DF2=t(DF) | |
colnames(DF2)=c('Died','Survived') | |
head(DF2) | |
# character data | |
x = 7 | |
y = 10 | |
cat('x should be greater than y, but x=',x,'and y=',y,'\n') | |
cat('Long strings can','be displayed over', | |
'several lines using','the fill= argument', | |
fill=40) | |
paste('one',2,'three',4,'five') | |
paste(c('one','two','three','four')) | |
paste(c('one','two','three','four'),collapse=' ') | |
paste('X',1:5,sep='') | |
paste(c('X','Y'),1:5,sep='') | |
paste(c('X','Y'),1:5,sep='_',collapse='|') | |
paste(c('X','Y'),1:5,'^',c('a','b'),sep='_',collapse='|') | |
paste(c('X','Y'),1:5,'^',c('a','b'),sep='_') | |
head(state.name) | |
substring(state.name,2,6) | |
mystring = 'dog cat duck' | |
substring(mystring,c(1,5,9),c(3,7,12)) | |
state = 'Mississippi' | |
ll = nchar(state) | |
ll | |
ltrs = substring(state,1:ll,1:ll) | |
ltrs | |
which(ltrs == 's') | |
# loops and repeats | |
for (i in 1:5) print(i^2) | |
j<-k<-0 | |
for (i in 1:5) { | |
j<- j+1 | |
k<-k+i*j | |
print(i+j+k) | |
} | |
fac2<-function(x) { f <- 1 | |
t <- x | |
while(t>1) { | |
f <- f*t | |
t <- t-1 | |
} | |
return(f) | |
} | |
sapply(0:5,fac2) | |
fac3 <- function(x) { | |
f <- 1 | |
t <- x | |
repeat { | |
if (t<2) break | |
f <- f*t | |
t <- t-1 } | |
return(f) } | |
sapply(0:5,fac3) | |
cumprod(1:5) | |
fac4<-function(x) max(cumprod(1:x)) | |
sapply(0:5,fac4) | |
fibonacci<-function(n) { | |
a<-1 | |
b<-0 | |
while(n>0) | |
{swap<-a | |
a<-a+b | |
b<-swap | |
n<-n-1 } | |
b} | |
sapply(1:10,fibonacci) | |
# avoiding loops | |
for (i in 1:length(y)) { if(y[i] < 0) y[i] <- 0 } | |
y [y < 0] <- 0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment