Last active
February 7, 2018 07:31
-
-
Save meefen/7454d2fc8b9bc821c12a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Frequent Sequence Mining | |
======================================================== | |
```{r knitr-options} | |
library(knitr) | |
options(width=200, digits=2) | |
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE, tidy = FALSE, size="small") | |
``` | |
## Data preparation | |
Convert data into FSM format | |
```{r} | |
# Read data | |
dft <- read.csv("threads.csv") | |
dfc <- read.csv("coding.csv") | |
dft$grade <- factor(dft$grade) | |
dfc$coding <- factor(dfc$coding, levels=c("Q","T","OE","WE","SY","S")) | |
# Reformat dfc | |
dfc$sequenceID <- as.numeric(dfc$thread) | |
dfc$eventID <- dfc$sequenceID * 100000 + dfc$id | |
dfc$size <- 1 | |
dfc$items <- dfc$coding | |
# Partition by thread type | |
t_e <- dft$thread[which(dft$type == "e")] | |
t_i <- dft$thread[which(dft$type == "i")] | |
dfc_e <- subset(dfc, thread %in% t_e, select=c(sequenceID, eventID, size, items)) | |
dfc_i <- subset(dfc, thread %in% t_i, select=c(sequenceID, eventID, size, items)) | |
dfc_e <- dfc_e[with(dfc_e, order(eventID)), ] | |
dfc_i <- dfc_i[with(dfc_i, order(eventID)), ] | |
# Save files | |
basket_files <- c("effective", "improvable") | |
write.table(dfc_e, file=basket_files[1], quote=FALSE, row.names=FALSE, col.names=FALSE) | |
write.table(dfc_i, file=basket_files[2], quote=FALSE, row.names=FALSE, col.names=FALSE) | |
``` | |
## Frequent sequence mining | |
```{r} | |
library(arulesSequences) | |
source("utils.R") # Load functions | |
# mine frequent sequences | |
fs_all <- data.frame(group=character(), # empty df for frequent sequences | |
sequence=character(), support=character()) | |
r_all <- data.frame(group=character(), # empty df for rules | |
rule=character(), support=character(), | |
confidence=character(), lift=character()) | |
for(file in basket_files) { | |
bskt <- read_baskets(file, info=c("sequenceID","eventID","SIZE")) # read baskets | |
# mine frequent sequences | |
fs <- cspade(bskt, parameter = list(support = 0.1, maxlen=4, maxgap=2), | |
control = list(verbose = TRUE, bfstype=TRUE)) | |
fs_df <- cbind(group=file, as(fs, "data.frame")) | |
fs_all <- rbind(fs_all, fs_df) | |
# select interesting sequences / rules | |
rules <- ruleInduction(fs, confidence = 0.6, # induct rules >= a confidence level | |
control = list(verbose = TRUE)) | |
rules.df <- subset(as(rules, "data.frame"), lift > 1.2) # filter by lift | |
if(nrow(rules.df) > 0) | |
r_all <- rbind(r_all, cbind(group=file, rules.df)) | |
} | |
## Check results | |
str(fs_all) | |
## Get single-event sequences | |
fs_single <- fs_all[!grepl("\\},\\{", fs_all$sequence), ] | |
fs_single_pretty <- Prettify(fs_single) | |
write.csv(fs_single_pretty, file="fs_single_pretty.csv", row.names=FALSE) | |
# Get multi-event sequences | |
fs_multi <- fs_all[grep("\\},\\{", fs_all$sequence), ] | |
fs_multi_pretty <- Prettify(fs_multi) | |
write.csv(fs_multi_pretty, file="fs_multi_pretty.csv", row.names=FALSE) | |
``` | |
## Interpret results | |
```{r} | |
Differentiate(fs_single_pretty, basket_files, 0.05) # compare single | |
Differentiate(fs_multi_pretty, basket_files, 0.2) # compare multiple | |
Differentiate(fs_multi_pretty, basket_files, 0.1, 0.2) # compare multiple | |
``` | |
Induct rules | |
```{r} | |
r_all[with(r_all, order(group, -lift)), ] | |
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Utils for FSM | |
Prettify <- function(df) { | |
## Function to prettify fs df | |
sequences <- as.character(unique(df$sequence)) | |
len <- length(sequences) | |
df_pretty <- data.frame(sequence = sequences) | |
for(file in unique(df$group)) { | |
sub <- df[df$group == file, ] | |
vec <- rep(NA, len) | |
vec[match(sub$sequence, sequences)] <- sub$support | |
df_pretty <- cbind(df_pretty, vec) | |
names(df_pretty)[ncol(df_pretty)] <- file | |
} | |
return(df_pretty) | |
} | |
ClearStandloneEvents <- function(df) { | |
## Function to clear standalone video events | |
df[!grepl("\\{(pause|out-of-sequence|seeked|short-event| | |
immediate-review|ratechange|seen.before)\\}", df$sequence), ] | |
} | |
FilterSequences <- function(df, threshold = 0.0) { | |
## Function to filter the frequence/support of identified sequences | |
df[df$support >= threshold, ] | |
} | |
Differentiate <- function(fs, groups=NULL, diff_min=0.05, diff_max=NULL) { | |
### Function to find differences among specified groups, | |
### in terms of frequent sequences | |
### | |
### Parameters: | |
### fs: data.frame (prettifed) containing data -- row: sequences; col: groups | |
### groups: character vector specifying groups to compare | |
### diff: threshold of difference | |
sub <- subset(fs, select=c("sequence", groups)) | |
choose <- apply(sub[, groups], 1, function(x) { | |
if(sum(!is.na(x)) == 0) return(FALSE) | |
if(is.null(diff_max)) | |
diff(range(x, na.rm=TRUE)) > diff_min | |
else | |
diff(range(x, na.rm=TRUE)) > diff_min & diff(range(x, na.rm=TRUE)) < diff_max | |
}) | |
fs.d <- sub[choose, ] | |
fs.d$diff <- fs.d[, 2] - fs.d[, 3] | |
fs.d[order(-fs.d[ , 4]),] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment