[TOC]

New dataframe

First Column as data frame

df[, 1, drop = FALSE] # drop = FALSE belongs inside `[`: it keeps the single column as a one-column data frame

mydata$newvar <- oldvar

merging

new_writers_df <- merge(writers_df, data2) # inner join (default): only rows with a match in both tables
merge(writers_df, data2, all.x = TRUE) # left outer join: extra NA-filled rows for non-matching writers_df entries
merge(writers_df, data2, all.y = TRUE) # right outer join: extra NA-filled rows for non-matching data2 entries

column names differ

merge(writers_df, data2, by.x="Age.At.Death", by.y="Age")

New column

train[, INI := substr(Name, 1, 1)] # new column with first character from column Name

new col = 1 col + 1 col

ds$temp_range <- ds$max_temp - ds$min_temp

transform is neater when adding multiple columns

train <- transform(train, avg_times = mon_since_first_don / no_don)
ds <- transform(ds,
                temp_range=max_temp-min_temp,
                excess=rainfall-evaporation)

set values in another column

dt[INI == "", named := 0]

conditional

dt[, sex := "Unknown"]
dt[SexuponOutcome %like% "Male", sex := "Male"]
dt[SexuponOutcome %like% "Female", sex := "Female"]

full_imp[is.na(age), age := imp_age] # where age is NA, copy the value over from column imp_age

add vector (as column)

mydata <- cbind(mydata, newVector)

Row

writers_df$Location <- c("Belgium", "United Kingdom", "United States", "United Kingdom")


new_row <- list(50, 22, "Roberto", "Bolano", "MALE", "2003-07-15") # list(), not c(): c() would coerce the numbers to character
writers_df_large <- rbind(writers_df, new_row)
rbindlist(list(dt.train[, -1, with = FALSE], dt.test)) # rbindlist is faster; it takes ONE list of tables: rbindlist(list(DT1, DT2))

add vector (as row)

mydata <- rbind(mydata, newVector)

removing

(NA values)

na.omit(merge) # drop every row that has an NA in any column

merge[complete.cases(merge[,2:3]),] # keep rows whose values in columns 2 and 3 are complete

remove rows with missing target

ds <- ds[!is.na(ds[target]),] # keep only the rows whose target value is not NA
sum(is.na(ds[target])) # sanity check: number of NAs remaining in the target

columns

ds$excess <- NULL

Imputing

ds[vars] <- na.roughfix(ds[vars])
sum(is.na(ds[vars]))

Removing data

columns

writers_df[3] <- NULL # drops the third column; assigning NULL to a single cell ([1, 3]) is an error

value

writers_df$Age.At.Death[1] <- NA # a single value cannot be deleted from a data frame, only set to NA

rows

rows_to_keep <- c(TRUE, FALSE, TRUE, FALSE)
limited_writers_df <- writers_df[rows_to_keep,]

conditional

fourty_sth_writers <- writers_df[which(writers_df$Age.At.Death > 40), ] # which() skips NA ages instead of returning all-NA rows

Reshaping

Stack: concatenate/combine multiple vectors -> single vector

##   Subject Gender Read Write Listen
## 1       1      M   10     8      7
## 2       2      F    7     4      6
long_format <- stack(observations_wide,
                     select=c(Read,
                              Write,
                              Listen))
##   values    ind
## 1     10   Read
## 2      7   Read
## 3      8  Write
## 4      4  Write
## 5      7 Listen
## 6      6 Listen
#----------------------------------------

Unstack: split a single stacked vector -> multiple vectors (reverse of stack)

##   Subject Gender   Test Result
## 1       1      M   Read     10
## 2       2      F  Write      4
## 3       1      M  Write      8
## 4       2      F Listen      6
## 5       2      F   Read      7
## 6       1      M Listen      7
unstack(observations_long,
                       Result ~ Test)
##   Listen Read Write
## 1      6   10     4
## 2      7    7     8
#----------------------------------------

Reshaping dataframes

with reshape

with tidyr

Dates

dt[, DateTime := lubridate::ymd_hms(DateTime)] # parse the timestamp strings into POSIXct
dt[, time := lubridate::hour(DateTime) + lubridate::minute(DateTime) / 60] # time of day in decimal hours
dt[, year := factor(lubridate::year(DateTime))] # lubridate:: everywhere: data.table exports hour/minute/year/... too
dt[, month := factor(lubridate::month(DateTime))]
dt[, day := as.numeric(lubridate::day(DateTime))]
dt[, weekday := lubridate::wday(DateTime)]

Converting

Columns

names(writers_df) <- c("Age.At.Death", "Age.As.Writer", "Name", "Surname", "Gender", "Death")
colnames(writers_df) <- c("Age.At.Death", "Age.As.Writer", "Name", "Surname", "Gender", "Death") # same effect as names() on a data frame
rownames(writers_df) <- c("ID1", "ID2", "ID3", "ID4")

List/matrix -> dataframe

A <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 2, byrow = TRUE) # 2 x 3, filled row by row (ncol follows from the data)
A_df <- as.data.frame(A) # columns are auto-named V1, V2, V3

dataframe -> matrix/list

writers_matrix <- as.matrix(writers_df)
writers_list <- as.list(writers_df)

year/month/day into days

dt[AgeuponOutcome == "", AgeuponOutcome := "unknown unknown"] # placeholder so every value splits into two parts
parsed_age <- do.call(rbind, strsplit(dt[, AgeuponOutcome], " ")) # strsplit is vectorised: one "<num> <unit>" row per record
dt[, c("num", "unit") := .(parsed_age[, 1], parsed_age[, 2])]
dt[unit %like% "year", age := as.numeric(num) * 365]
dt[unit %like% "month", age := as.numeric(num) * 30.5]
dt[unit %like% "week", age := as.numeric(num) * 7]
dt[unit %like% "day", age := as.numeric(num)]
dt[unit == "unknown", age := NA_real_] # typed NA keeps age numeric

2007-11-01 -> "2007-11-01 UTC"

library(lubridate)
ds$date <- ymd(as.character(ds$date))

make target factor

ds[target] <- as.factor(ds[[target]]) # [[ ]] extracts the column as a vector; presumably target holds the column name as a string

p <- ggplot(ds, aes_string(x=target)) # NOTE(review): aes_string() is deprecated in recent ggplot2; aes(x = .data[[target]]) is the replacement - confirm installed version
p <- p + geom_bar(width=0.2) # bar chart of counts per target level
print(p)

fabsta/3. feature engineering (R data science).md

New dataframe

First Column as data frame

merging

column names differ

New column

new col = 1 col + 1 col

transform is neater when adding multiple columns

set values in another column

conditional

add vector (as column)

Row

add vector (as row)

removing

(NA values)

remove rows with missing target

columns

Imputing

Removing data

columns

value

rows

conditional

Reshaping

Stack: concatenate/combine multiple vectors -> single vector

Unstack: split a single stacked vector -> multiple vectors (reverse of stack)

Reshaping dataframes

with reshape

with tidyr

Dates

Converting

Columns

List/matrix -> dataframe

dataframe -> matrix/list

year/month/day into days

2007-11-01 -> "2007-11-01 UTC"

make target factor