Last active
December 30, 2015 06:19
-
-
Save tomasgreif/7788946 to your computer and use it in GitHub Desktop.
Create OLAP cube for Saiku from any data frame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
GenerateCubeSQLite <- function(DataFrame,PrimaryKey,CubeDestination,DataSourceDestination,NumericAggregators=NA,DateAggregators=NA,MaxTime='2015-01-01',MinTime='2001-01-01') { | |
# Function to clean names | |
standardize_name <- function(string) { | |
capped <- grep("^[^A-Z]*$", string, perl = TRUE) | |
substr(string[capped], 1, 1) <- toupper(substr(string[capped], 1, 1)) | |
string <- gsub('_',' ',string, fixed=TRUE) | |
return(string) | |
} | |
# --------- Define connection | |
library(sqldf) | |
library("RSQLite") | |
file.remove('__this_is_temporary_db__.db') | |
sqlite <- dbDriver("SQLite") | |
tmpdb <- dbConnect(sqlite,"__this_is_temporary_db__.db") | |
DataFrameName <- deparse(substitute(DataFrame)) | |
TableName <- DataFrameName | |
dbWriteTable(tmpdb,DataFrameName,DataFrame) | |
#------ Generate time dimension | |
ddiff <- as.integer(as.Date(MaxTime)) - as.integer(as.Date(MinTime)) | |
tmp_time <- data.frame(time_date = as.Date(MinTime) + seq(1:ddiff)) | |
tmp_time$month_number <- as.integer(format(tmp_time$time_date, "%m")) | |
tmp_time$year_number <- as.integer(format(tmp_time$time_date, "%Y")) | |
tmp_time$quarter_number[tmp_time$month_number %in% 1:3] <- 1 | |
tmp_time$quarter_number[tmp_time$month_number %in% 4:6] <- 2 | |
tmp_time$quarter_number[tmp_time$month_number %in% 7:9] <- 3 | |
tmp_time$quarter_number[tmp_time$month_number %in% 10:12] <- 4 | |
TimeTableName <- 'tmp_time' | |
dbWriteTable(tmpdb,'tmp_time',tmp_time) | |
#------ Disconnect | |
dbDisconnect(tmpdb) | |
# -------- Get column types from R data frame | |
TableDesign <- data.frame(type=sapply(DataFrame, typeof), datetype=sapply(names(DataFrame), function(x) inherits(DataFrame[[x]], "Date"))) | |
TableDesign$column_name <- row.names(TableDesign) | |
row.names(TableDesign) <- NULL | |
TableDesign <- sqldf("select column_name, case when datetype then 'date' else type end as data_type from TableDesign", drv='SQLite') | |
# -------- Get lists of dimensions, time dimensions, measures | |
DimensionVariables <- TableDesign[TableDesign$data_type %in% c('character','integer'),1] | |
DimensionVariablesNames <- standardize_name(DimensionVariables) | |
VariablesNumeric <- sort(TableDesign[TableDesign$data_type %in% c('numeric','integer','double'),1]) | |
VariablesNumeric <- VariablesNumeric[!(VariablesNumeric %in% PrimaryKey)] | |
VariablesDate <- TableDesign[TableDesign$data_type %in% c('date'),1] | |
VariablesDateNames <- standardize_name(VariablesDate) | |
#----------- Define aggregators for different types of variables | |
Aggregators <- c('sum','count','min','max','avg','distinct-count') | |
AggregatorsNames <- c('Sum','Cnt','Min','Max','Avg','Dcnt') | |
# Numeric Aggregators | |
if (!all(is.na(NumericAggregators))) { | |
NumericAggregators <- Aggregators[Aggregators %in% NumericAggregators] | |
NumericAggregatorsNames <- AggregatorsNames[Aggregators %in% NumericAggregators] | |
} else { | |
NumericAggregators <- Aggregators | |
NumericAggregatorsNames <- AggregatorsNames | |
} | |
# Date Aggregators | |
if (!all(is.na(DateAggregators))) { | |
DateAggregators <- Aggregators[Aggregators %in% DateAggregators] | |
DateAggregatorsNames <- AggregatorsNames[Aggregators %in% DateAggregators] | |
} else { | |
DateAggregators <- 'count' | |
DateAggregatorsNames <- '#' | |
} | |
#---------- Define Dimensions | |
# Initialize empty vectors | |
DimensionsDate <- character(0) | |
DimensionsVarchar <- character(0) | |
# Time dimension | |
if (length(VariablesDate) > 0) { | |
DimensionsDate <- paste0(' | |
<Dimension type="TimeDimension" visible="true" foreignKey="',VariablesDate,'" highCardinality="false" name="',VariablesDateNames,'"> | |
<Hierarchy name="Time Hierarchy" visible="true" hasAll="true" primaryKey="time_date"> | |
<Table name="',TimeTableName,'"></Table> | |
<Level name="Year" visible="true" column="year_number" ordinalColumn="year_number" type="Integer" internalType="int" uniqueMembers="false" levelType="TimeYears" hideMemberIf="Never"> | |
</Level> | |
<Level name="Quarter" visible="true" column="quarter_number" ordinalColumn="quarter_number" type="Integer" uniqueMembers="false" levelType="TimeQuarters" hideMemberIf="Never"> | |
</Level> | |
<Level name="Month" visible="true" column="month_number" ordinalColumn="month_number" type="Integer" uniqueMembers="false" levelType="TimeMonths" hideMemberIf="Never"> | |
</Level> | |
</Hierarchy> | |
</Dimension>',collapse='') | |
} | |
# Other dimensions | |
if(length(DimensionVariables) > 0) { | |
DimensionsVarchar <- paste(' | |
<Dimension type="StandardDimension" visible="true" foreignKey="',PrimaryKey,'" highCardinality="false" name="',DimensionVariablesNames,'"> | |
<Hierarchy name="',DimensionVariablesNames,'" visible="true" hasAll="true" primaryKey="',PrimaryKey,'"> | |
<Table name="',TableName,'"> | |
</Table> | |
<Level name="',DimensionVariablesNames,'" visible="true" column="',DimensionVariables,'" type="String" uniqueMembers="false" levelType="Regular" hideMemberIf="Never"> | |
</Level> | |
</Hierarchy> | |
</Dimension>',sep='', collapse='') | |
} | |
# Merge All dimensions togetgher | |
Dimensions <- paste0(DimensionsDate, DimensionsVarchar) | |
#------------- Generate Measures | |
# Generate default measure - count of primary column | |
MeasurePrimaryKey <- paste(' <Measure name="','Count (PK)','" column="',PrimaryKey,'" aggregator="count" visible="true"></Measure>',sep='') | |
# Initialize empty vectors | |
NumericMeasures <- character(0) | |
DateMeasures <- character(0) | |
# Generate measures for numeric variables | |
if(length(VariablesNumeric) > 0) { | |
for (i in seq_along(NumericAggregators)) { | |
NumericMeasures <- c(NumericMeasures,paste0(' <Measure name="',standardize_name(VariablesNumeric),'-',NumericAggregatorsNames[i] | |
,'" column="',VariablesNumeric,'" aggregator="',NumericAggregators[i],'" visible="true"></Measure>')) | |
} | |
} | |
# Generate measures for date variables | |
if(length(VariablesDate) > 0) { | |
for (i in seq_along(DateAggregators)) { | |
DateMeasures <- c(DateMeasures,paste0(' <Measure name="',standardize_name(VariablesDate),' ',DateAggregatorsNames[i] | |
,'" column="',VariablesDate,'" aggregator="',DateAggregators[i],'" visible="true"></Measure>')) | |
} | |
} | |
# Paste measures together in desired order | |
Measures <- paste0( MeasurePrimaryKey,'\n', paste0(sort(c(NumericMeasures, DateMeasures)),collapse='\n'),collapse='\n') | |
# ---------- Define Header and Footer for cube | |
CubeHeader <- paste0(c( | |
paste0('<Schema name="R Generated Cube">'), | |
paste0(' <Cube name="',TableName,'" visible="true" cache="true" enabled="true">'), | |
paste0(' <Table name="',TableName,'"></Table>')) | |
,collapse='\n') | |
CubeFooter <- '</Cube></Schema>' | |
# --------- Generate connection file | |
ConnectionFile <- paste0('type=OLAP | |
name=RData | |
driver=mondrian.olap4j.MondrianOlap4jDriver | |
location=jdbc:mondrian:Jdbc=jdbc:sqlite:',getwd(),'/__this_is_temporary_db__.db; \\ | |
Catalog=',CubeDestination,';JdbcDrivers=org.sqlite.JDBC | |
') | |
# ---------- Write results to file | |
# Write Cube | |
writeLines(paste0(CubeHeader,'\n',Dimensions,'\n\n',Measures,'\n',CubeFooter),con=CubeDestination) | |
# Write Data source | |
writeLines(ConnectionFile,con=DataSourceDestination) | |
} | |
# Example 1 | |
mtcars2 <- mtcars | |
mtcars2$id <- seq(1:nrow(mtcars2)) | |
mtcars2$vs <- as.integer(mtcars2$vs) | |
mtcars2$am <- as.integer(mtcars2$am) | |
mtcars2$gear <- as.integer(mtcars2$gear) | |
mtcars2$carb <- as.integer(mtcars2$carb) | |
GenerateCubeSQLite(mtcars2, | |
PrimaryKey='id', | |
CubeDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/foodmart/test.xml', | |
DataSourceDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/saiku-datasources/test') | |
# Example 2 | |
big_portfolio <- read.csv(file='big_portfolio.csv',header=TRUE,sep=';',stringsAsFactors=FALSE) | |
big_portfolio$origination_date <- as.Date(big_portfolio$origination_date) | |
big_portfolio$repayment_date <- as.Date(big_portfolio$repayment_date,format='%Y-%m-%d') | |
str(big_portfolio) | |
GenerateCubeSQLite(big_portfolio, | |
PrimaryKey='id', | |
CubeDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/foodmart/test.xml', | |
DataSourceDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/saiku-datasources/test') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment