Skip to content

Instantly share code, notes, and snippets.

@fabsta
Last active July 21, 2016 09:29
Show Gist options
  • Save fabsta/de0a0d8e5adab3ebbc27381dea1a4ccc to your computer and use it in GitHub Desktop.
Save fabsta/de0a0d8e5adab3ebbc27381dea1a4ccc to your computer and use it in GitHub Desktop.

[TOC]

New dataframe

First Column as data frame

# Keep the first column as a one-column data frame rather than a bare vector.
# `drop = FALSE` is an argument of `[`, not of as.data.frame(), and the
# logical constant is FALSE (R has no lowercase `false`).
as.data.frame(df[, 1, drop = FALSE])

# Store the existing vector `oldvar` in `mydata` as a column named "newvar".
mydata[["newvar"]] <- oldvar

merging

# Inner join (default): only rows whose key values occur in both tables.
new_writers_df <- merge(writers_df, data2)
# Left outer join: every row of writers_df is kept; rows without a match in
# data2 get NA in the columns that come from data2.
merge(writers_df, data2, all.x = TRUE)
# Right outer join: every row of data2 is kept; rows without a match in
# writers_df get NA in the columns that come from writers_df.
merge(writers_df, data2, all.y = TRUE)

column names differ

# Join when the key column is named differently in each table:
# "Age.At.Death" in writers_df, "Age" in data2.
merge(
  x = writers_df,
  y = data2,
  by.x = "Age.At.Death",
  by.y = "Age"
)

New column

# Add column INI holding the first character of each value in column Name.
train[, `:=`(INI = substr(Name, start = 1, stop = 1))]

new col = 1 col + 1 col

# New column computed as the difference of two existing columns.
ds[["temp_range"]] <- ds[["max_temp"]] - ds[["min_temp"]]

transform is neater when adding multiple columns

# Single derived column: mon_since_first_don divided by no_don.
train <- transform(
  train,
  avg_times = mon_since_first_don / no_don
)
# Several derived columns can be created in one transform() call.
ds <- transform(
  ds,
  temp_range = max_temp - min_temp,
  excess = rainfall - evaporation
)

set values in another column

# For rows where INI is the empty string, set column `named` to 0.
dt[INI == "", `:=`(named = 0)]

conditional

# Default every row to "Unknown", then overwrite the rows whose
# SexuponOutcome matches the pattern "Male" or "Female" (case-sensitive).
dt[, `:=`(sex = "Unknown")]
dt[like(SexuponOutcome, "Male"), `:=`(sex = "Male")]
dt[like(SexuponOutcome, "Female"), `:=`(sex = "Female")]

# Where `age` is NA, take the value of `imp_age` from the same row.
full_imp[is.na(age), age := imp_age]

add vector (as column)

# Append newVector to mydata as an additional column.
mydata <- cbind(mydata, newVector)

Row

# Add a Location column; the vector supplies one value per row (four here).
writers_df[["Location"]] <- c(
  "Belgium",
  "United Kingdom",
  "United States",
  "United Kingdom"
)


# Build the new row as a list, not with c(): c() coerces mixed values to
# character, and rbind() would then turn the numeric columns into character.
# The list elements are matched to the columns by position.
new_row <- list(50, 22, "Roberto", "Bolano", "MALE", "2003-07-15")
writers_df_large <- rbind(writers_df, new_row)
# rbindlist() is faster than rbind(); it takes ONE list holding the tables,
# e.g. rbindlist(list(DT1, DT2)). Here the first column of dt.train is dropped.
rbindlist(list(dt.train[, -1, with = FALSE], dt.test))

add vector (as row)

# Append newVector to mydata as an additional row.
mydata <- rbind(mydata, newVector)

removing

(NA values)

# `merge` is used here as the name of a data object (presumably the result of
# an earlier merge), not the base function.
# Drop every row that contains an NA in any column.
na.omit(merge)

# Keep only the rows that have no NA in columns 2 and 3.
merge[complete.cases(merge[, c(2, 3)]), ]

remove rows with missing target

# `target` holds the name of the target column as a string.
# Keep only the rows whose target value is present ...
ds <- ds[!is.na(ds[[target]]), ]
# ... and count the missing target values that remain.
sum(is.na(ds[[target]]))

columns

# Assigning NULL to a column removes it from the data frame.
ds[["excess"]] <- NULL

Imputing

# Replace the columns named in `vars` with the output of na.roughfix()
# (presumably randomForest::na.roughfix — confirm the package is attached).
ds[vars] <- na.roughfix(ds[vars])
# Count the NAs left in those columns afterwards.
sum(is.na(ds[vars]))

Removing data

columns

# A single cell cannot be removed by assigning NULL (the assignment fails with
# a zero-length replacement); mark the value as missing with NA instead.
# To drop the whole third column, use: writers_df[3] <- NULL
writers_df[1, 3] <- NA

value

# The column has to be addressed through its data frame: assigning NULL to
# the bare name would only create a new variable and leave writers_df as is.
writers_df$Age.At.Death <- NULL

rows

# Logical mask, one entry per row: TRUE keeps the row, FALSE drops it.
rows_to_keep <- rep(c(TRUE, FALSE), times = 2)
limited_writers_df <- writers_df[rows_to_keep, ]

conditional

# Keep the rows whose Age.At.Death is greater than 40.
fourty_sth_writers <- writers_df[writers_df[["Age.At.Death"]] > 40, ]

Reshaping

Stack: concatenate/combine multiple vectors -> single vector

## Input (observations_wide), one column per test:
##   Subject Gender Read Write Listen
## 1       1      M   10     8      7
## 2       2      F    7     4      6
long_format <- stack(observations_wide, select = c(Read, Write, Listen))
## Result: the selected columns stacked into `values`, with the source
## column name recorded in `ind`:
##   values    ind
## 1     10   Read
## 2      7   Read
## 3      8  Write
## 4      4  Write
## 5      7 Listen
## 6      6 Listen
#----------------------------------------

Unstack: split a single stacked vector -> multiple vectors (reverse of stack)

## Input (observations_long), one row per subject/test pair:
##   Subject Gender   Test Result
## 1       1      M   Read     10
## 2       2      F  Write      4
## 3       1      M  Write      8
## 4       2      F Listen      6
## 5       2      F   Read      7
# The formula reads "spread Result into one column per level of Test".
unstack(observations_long, form = Result ~ Test)
## Result:
##   Listen Read Write
## 1      6   10     4
## 2      7    7     8
#----------------------------------------

Reshaping dataframes

with reshape

with tidyr

Dates

# Parse the DateTime strings, then derive calendar features from the result.
# Every helper is namespace-qualified so the calls resolve to lubridate no
# matter which package was attached last (data.table exports helpers with
# the same names, e.g. hour(), minute(), year()).
dt[, DateTime := lubridate::ymd_hms(DateTime)]
# Time of day as fractional hours, e.g. 14:30 -> 14.5.
dt[, time := lubridate::hour(DateTime) + lubridate::minute(DateTime) / 60]
dt[, year := factor(lubridate::year(DateTime))]
dt[, month := factor(lubridate::month(DateTime))]
dt[, day := as.numeric(lubridate::day(DateTime))]
dt[, weekday := lubridate::wday(DateTime)]

Converting

Columns

# names() and colnames() are interchangeable on a data frame; either call on
# its own is enough to rename the columns.
names(writers_df) <- c(
  "Age.At.Death", "Age.As.Writer", "Name", "Surname", "Gender", "Death"
)
colnames(writers_df) <- c(
  "Age.At.Death", "Age.As.Writer", "Name", "Surname", "Gender", "Death"
)
# Row labels: one name per row.
rownames(writers_df) <- c("ID1", "ID2", "ID3", "ID4")

List/matrix -> dataframe

# Build a 2 x 3 matrix filled row by row, then convert it to a data frame
# (the columns get the default names V1, V2, V3).
A <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 2, ncol = 3, byrow = TRUE)
A_df <- as.data.frame(A)

dataframe -> matrix/list

# Convert the data frame to a matrix.
writers_matrix <- as.matrix(writers_df)
# Convert the data frame to a plain list.
writers_list <- as.list(writers_df)

year/month/week/day into days

# Give empty ages a two-token placeholder so that every value splits into a
# number part and a unit part.
dt[AgeuponOutcome == "", AgeuponOutcome := "unknown unknown"]
# strsplit() is already vectorised, so no sapply() wrapper is needed. The
# per-row pieces are bound into a character matrix: column 1 = number,
# column 2 = unit.
parsed_age <- do.call(
  rbind,
  strsplit(dt[, AgeuponOutcome], " ", fixed = TRUE)
)
dt[, c("num", "unit") := .(parsed_age[, 1], parsed_age[, 2])]
# Convert each unit to days; %like% is a pattern match, so "year" also
# covers "years". A month is approximated as 30.5 days.
dt[unit %like% "year", age := as.numeric(num) * 365]
dt[unit %like% "month", age := as.numeric(num) * 30.5]
dt[unit %like% "week", age := as.numeric(num) * 7]
dt[unit %like% "day", age := as.numeric(num)]
# NA_real_ (not logical NA) matches the numeric type of the age column.
dt[unit == "unknown", age := NA_real_]

2007-11-01 -> "2007-11-01 UTC"

library(lubridate)
# Re-parse the date column as year-month-day, going through its character
# representation first.
ds[["date"]] <- ymd(as.character(ds[["date"]]))

make target factor

# `target` holds the target column's name as a string: convert that column
# to a factor, then draw a bar chart with the target on the x axis.
ds[[target]] <- as.factor(ds[[target]])

p <- ggplot(ds, aes_string(x = target)) +
  geom_bar(width = 0.2)
print(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment