Last active
August 29, 2015 14:10
-
-
Save leoluyi/3b5735b0d75b0b4a06d7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## === 第一次用R需安裝套件 === | |
list.of.packages <- | |
c("plyr", "xlsx", "tools","dplyr","openxlsx", | |
"sampling", "magrittr", "sjmisc") | |
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] | |
if(length(new.packages)) install.packages(new.packages) | |
rm(list.of.packages, new.packages) | |
## -- 更新 package ---- | |
if(packageDescription("openxlsx")$Version < "2.3.11") { | |
devtools::install_github("awalker89/openxlsx") | |
} | |
as.strataCrossN <- function(.strataPctList, .namesList, N){ | |
## error handler | |
if (length(.strataPctList) != length(.namesList)) | |
stop("strataPctList和namesList長度不一致") | |
if (is.list(.strataPctList) != TRUE) stop(".strataPctList must have class \"list\"") | |
if (is.list(.namesList) != TRUE) stop(".strataPctList must have class \"list\"") | |
for (i in seq_along(strataPctList)){ | |
if (i ==1) {strataCrossArr <- strataPctList[[1]]} | |
else {strataCrossArr <- outer(strataCrossArr, strataPctList[[i]])} | |
} | |
strataCrossArr <- as.array(strataCrossArr) | |
dimnames(strataCrossArr) <- .namesList | |
## 抽樣陣列 | |
N <- as.integer(N) | |
strataCrossN <- round(strataCrossArr*N, 0); # sum(strataCrossN) # 總數 | |
## 平面化 | |
strataCrossN_flat = as.data.frame.table(strataCrossN) | |
## 或用{plyr}做成平面化 | |
# library(plyr) | |
# adply(strataCrossN, c(1,2,3)) | |
return(strataCrossN_flat) | |
} | |
margin_prop <- function(.strataCrossN_flat, | |
export = FALSE, | |
outputfile="R實抽樣本temp.csv") | |
{ | |
library(dplyr) | |
## 實抽樣本expanded | |
strataCrossN.expanded <- .strataCrossN_flat[ | |
rep(row.names(.strataCrossN_flat), .strataCrossN_flat$Freq), | |
1:length(.strataCrossN_flat)] | |
strataCrossN.expanded <- strataCrossN.expanded[-length(strataCrossN.expanded)] | |
Ndim <- length(strataCrossN.expanded) | |
總抽樣本 <- margin.table(table(strataCrossN.expanded)) # Sum total | |
## 建立list | |
mar_tab <- function(x) { | |
prop.table(table(x, deparse.level = 0)) | |
} | |
tab <- function(x) table(x, deparse.level = 0) | |
probList <- c(總樣本數 = 總抽樣本, | |
lapply(strataCrossN.expanded, FUN = mar_tab)) | |
freqList <- lapply(strataCrossN.expanded, FUN = tab) | |
print(freqList) | |
cat(paste0(rep("-", 40), collapse = ""), "\n") | |
## 實抽樣本寫入txt | |
#cat(.strataCrossN_flat$Freq, file = "實抽樣本vector.txt", sep = "\n") | |
if(export) { | |
write.csv(.strataCrossN_flat, file = outputfile, row.names=FALSE) | |
} | |
probList <- c(probList[1], lapply(probList[-1], format, digits=3, nsmall=3)) | |
probList | |
} | |
renew_sampleN <- function (.strataCrossN_flat, outputfile="R實抽樣本temp.csv") { | |
## 實抽樣本從txt重讀 | |
#實抽樣本 <- as.numeric(readLines("實抽樣vector本.txt")) | |
實抽樣本 <- read.csv(file = outputfile,header=TRUE)$Freq | |
.strataCrossN_flat$Freq <- 實抽樣本 # 更新 | |
## 實抽樣本寫入txt | |
#cat(.strataCrossN_flat$Freq, file = "實抽樣本vector.txt", sep = "\n") | |
write.csv(.strataCrossN_flat, file = outputfile, row.names=FALSE) | |
return(.strataCrossN_flat) | |
} | |
# ## 讀取raw data | |
# read.data.xlsx3 <- function (.fileName, .sheetIndex=1, .colIndex=NULL, | |
# .colClasses="character", encoding="unknown") | |
# { | |
# library(openxlsx) | |
# .data <- openxlsx::read.xlsx( | |
# xlsxFile = .fileName, | |
# sheet = .sheetIndex, | |
# colNames = T | |
# ) | |
# return(.data) | |
# } | |
# ## 讀取raw data | |
# read.data.xlsx2 <- function (.fileName, .sheetIndex=1, .colIndex=NULL, | |
# .colClasses="character", encoding="unknown") | |
# { | |
# jgc <- function(){ | |
# .jcall("java/lang/System", method = "gc") | |
# } | |
# | |
# jgc() | |
# | |
# options(java.parameters = "-Xmx4g") # read.xlsx增加記憶體 | |
# | |
# .data <- xlsx::read.xlsx2( | |
# file = .fileName, | |
# sheetIndex = .sheetIndex, | |
# header = T, | |
# colClasses = .colClasses, | |
# colIndex = .colIndex, # 只擷取前十欄 | |
# stringsAsFactors = FALSE | |
# ) | |
# return(.data) | |
# } | |
# ## 讀取raw data | |
# read.data.xlsx <- function (.fileName, .sheetIndex=1, .colIndex=NULL, | |
# .colClasses="character", encoding="unknown") | |
# { | |
# jgc <- function(){ | |
# .jcall("java/lang/System", method = "gc") | |
# } | |
# | |
# jgc() | |
# | |
# .data <- xlsx::read.xlsx( | |
# file = .fileName, | |
# sheetIndex = .sheetIndex, | |
# header = T, | |
# colClasses = .colClasses, | |
# colIndex = .colIndex, # 只擷取前十欄 | |
# encoding = encoding, | |
# stringsAsFactors = FALSE | |
# ) | |
# return(.data) | |
# } | |
## sample compare | |
sampleC <- function (.data, | |
.strataCrossN_flat, | |
.vars, | |
.namesList) { | |
library(dplyr) | |
# 防呆 | |
if (length(.vars) != length(.namesList)) | |
stop("length(.vars) must equal to length(.namesList)") | |
if (is.data.frame(.data) != TRUE) | |
stop(".data must have class \"data.frame\"") | |
ID_var_name <- names(.data)[grep("ID$", names(.data), ignore.case = TRUE)][[1]] | |
if(length(ID_var_name)!=0) | |
if(anyDuplicated(.data[[ID_var_name]])) { | |
stop(c("ID有重複: ", "\n", | |
paste0(unique( | |
.data[[ID_var_name]][duplicated(.data[[ID_var_name]])]), | |
collapse = "\n"))) | |
} | |
var_names <- names(.namesList) | |
# 排除不在list的樣本 | |
for(i in seq_along(.namesList)){ | |
filter_criteria <- lazyeval::interp(~ which_column %in% .namesList[[i]], | |
which_column = as.name(.vars[i])) | |
.data <- .data %>% filter_(filter_criteria) | |
} | |
# .data check levels | |
for(i in seq_along(.namesList)){ | |
.data[[.vars[i]]] <- .data[[.vars[i]]] %>% | |
factor(levels=.namesList[[i]]) | |
} | |
# .strataCrossN_flat check levels | |
for(i in seq_along(.namesList)){ | |
.strataCrossN_flat[[var_names[i]]] <- .strataCrossN_flat[[var_names[i]]] %>% | |
factor(levels=.namesList[[i]]) | |
} | |
# .strataCrossN_flat 排序 | |
.strataCrossN_flat <- .strataCrossN_flat[c(var_names, "Freq")] | |
.strataCrossN_flat <- .strataCrossN_flat %>% | |
arrange_(.dots = rev(names(.strataCrossN_flat)[-length(.strataCrossN_flat)])) | |
# 樣本分佈 | |
sampleCross_flat <- table( | |
.data[.vars]) %>% # (Q1R..為excel檔表頭變數名) | |
as.data.frame.table() | |
# sampleCross_flat 排序 | |
sampleCross_flat <- sampleCross_flat %>% arrange_(.dots = rev(.vars)) | |
# 檢查多寡 | |
ncolStrata = ncol(.strataCrossN_flat) | |
ncolSample = ncol(sampleCross_flat) | |
n1 <- sampleCross_flat[["Freq"]] | |
n2 <- .strataCrossN_flat[["Freq"]] | |
# 防呆 | |
if (length(n1) != length(n2)) stop("樣本組合與分層數不相等") | |
strataCompare <- | |
cbind(.strataCrossN_flat[!(names(.strataCrossN_flat) %in% c("Freq"))], | |
應抽樣本數 = n2, | |
Completed_N = n1, | |
樣本多寡 = n1 - n2 | |
) | |
return(strataCompare) | |
} | |
read.strata <- function(strata_file, .namesList) | |
{ | |
.strataCrossN_flat <- read.csv(strata_file, header = TRUE, | |
na.strings = "", stringsAsFactors=F) | |
var_names <- names(.namesList) | |
for(i in seq_along(.namesList)){ | |
.strataCrossN_flat[[var_names[i]]] <- .strataCrossN_flat[[var_names[i]]] %>% | |
factor(levels=.namesList[[i]]) | |
} | |
.strataCrossN_flat | |
} | |
strata.sample.ID <- function (.data, .strataCrossN_flat, .namesList, | |
.vars, ID_var_name="Panel.ID", | |
method=c("srswor","srswr","poisson","systematic"), | |
pik=NULL) | |
{ | |
## Possible error: | |
# Error in data.frame(..., check.names = FALSE) : | |
# arguments imply differing number of rows: 0, 1 | |
# | |
# The problem lies in your trying to set the sample size from | |
# some groups equal to zero. Instead, subset your original data before sampling. | |
# | |
# data.reordered <- subset(data.reordered, !(Q1R==1 && Q2R==2 && Q3R==4) && | |
# !(Q1R==2 && Q2R==2 && Q3R==4)) | |
# 實抽樣本 <- 實抽樣本[實抽樣本!=0] | |
library(dplyr) | |
library(sampling) | |
if(anyDuplicated(.data[[ID_var_name]])) { | |
stop(c("抽取之ID有重複: ", "\n", | |
paste0(.data[[ID_var_name]][duplicated(.data[[ID_var_name]])], | |
collapse = "\n"))) | |
} | |
var_names <- names(.namesList) | |
# .strataCrossN_flat check levels | |
for(i in seq_along(.namesList)){ | |
.strataCrossN_flat[[var_names[i]]] <- .strataCrossN_flat[[var_names[i]]] %>% | |
factor(levels=.namesList[[i]]) | |
} | |
# .strataCrossN_flat 排序 | |
.strataCrossN_flat <- .strataCrossN_flat[,c(var_names, "Freq")] | |
.strataCrossN_flat <- .strataCrossN_flat %>% | |
arrange_(.dots = rev(names(.strataCrossN_flat)[-length(.strataCrossN_flat)])) | |
# .strataCrossN_flat 移除Freq == 0 | |
.strataCrossN_flat <- .strataCrossN_flat %>% | |
filter(!(Freq == 0)) | |
N_vector <- .strataCrossN_flat$Freq | |
# .data排除重複id | |
.data <- .data %>% distinct_(ID_var_name) | |
# .data排除不在list的樣本 | |
for(i in seq_along(.namesList)){ | |
filter_criteria <- lazyeval::interp(~ which_column %in% .namesList[[i]], | |
which_column = as.name(.vars[i])) | |
.data <- .data %>% filter_(filter_criteria) | |
} | |
# .data check levels | |
for(i in seq_along(.namesList)){ | |
.data[[.vars[i]]] <- .data[[.vars[i]]] %>% | |
factor(levels=.namesList[[i]]) | |
} | |
# .data 排序 (strata抽樣樣本要先排序) | |
data.reordered <- arrange_(.data, .dots = rev(.vars)) ## 以後面變數為第一排序層級 | |
# 抽出之ID | |
ID_Output <- data.reordered %>% | |
strata(stratanames=.vars, size=N_vector, | |
method, .data[[pik]]) %>% | |
getdata(data.reordered,.) %>% | |
dplyr::select_(ID_var_name) %>% unlist %>% unname | |
return(ID_Output) | |
} | |
write.output.xlsx3 <- function(ID_sample, .data, ID_var_name){ | |
# ID_var_name: ID所在變數名 | |
# ID_sample: 要抽取的ID | |
library(tools) | |
library(openxlsx) | |
# 防呆 | |
if(anyDuplicated(ID_sample)) { | |
stop(c("抽取之ID有重複: ", "\n", | |
paste0(ID_sample[duplicated(ID_sample)], | |
collapse = "\n"))) | |
} | |
# .data排除重複id | |
.data <- .data %>% distinct_(ID_var_name) | |
tempData <- .data[.data[[ID_var_name]] %in% ID_sample,] | |
tempData_backup <- .data[!(.data[[ID_var_name]] %in% ID_sample),] | |
outputfileName <- paste(file_path_sans_ext(fileName), | |
"抽樣後raw_data.xlsx",sep = "_") | |
# 轉為data.frame | |
is_tbl <- inherits(.data, "tbl") | |
if(!is.data.frame(.data)) { | |
.data <- dplyr:::as_data_frame(.data) | |
} else if (is_tbl) { | |
.data <- dplyr:::tbl_df(.data) | |
} | |
wb <- openxlsx::createWorkbook( | |
creator = paste0("EOLembrain", Sys.getenv("USERNAME"), sep="_")) | |
openxlsx::addWorksheet(wb, paste("N=", length(unique(ID_sample)),sep="")) | |
openxlsx::addWorksheet(wb, "backup") | |
openxlsx::writeData(wb, paste("N=", length(unique(ID_sample)),sep=""), x = tempData, | |
rowNames = FALSE, keepNA=FALSE) | |
openxlsx::writeData(wb, "backup", x = tempData_backup, | |
rowNames = FALSE, keepNA=FALSE) | |
saveWorkbook(wb, file = outputfileName, overwrite = TRUE) | |
} | |
# write.output.xlsx2 <- function(ID_sample, .data, ID_var_name){ | |
# | |
# # ID_var_name: ID所在變數名 | |
# # ID_sample: 要抽取的ID | |
# | |
# options(java.parameters = "-Xmx4g") # read.xlsx增加記憶體 | |
# | |
# library(tools) | |
# library(xlsx) | |
# | |
# jgc <- function(){ | |
# .jcall("java/lang/System", method = "gc") | |
# } | |
# | |
# jgc() | |
# | |
# # 防呆 | |
# if(anyDuplicated(ID_sample)) { | |
# stop(c("抽取之ID有重複: ", "\n", | |
# paste0(ID_sample[duplicated(ID_sample)], | |
# collapse = "\n"))) | |
# } | |
# | |
# # .data排除重複id | |
# .data <- .data %>% distinct_(ID_var_name) | |
# | |
# tempData <- .data[.data[[ID_var_name]] %in% ID_sample,] | |
# tempData_backup <- .data[!(.data[[ID_var_name]] %in% ID_sample),] | |
# | |
# outputfileName <- paste(file_path_sans_ext(fileName), | |
# "抽樣後raw_data.xlsx",sep = "_") | |
# write.xlsx2(x = tempData, file = outputfileName, | |
# sheetName = paste("N=", length(unique(ID_sample)),sep=""), | |
# row.names = FALSE, showNA=FALSE) | |
# write.xlsx2(x = tempData_backup, file = outputfileName, append=TRUE, | |
# sheetName = "backup", | |
# row.names = FALSE, showNA=FALSE) | |
# } | |
# write.output.xlsx <- function(ID_sample, .data, ID_var_name){ | |
# | |
# # ID_var_name: ID所在變數名 | |
# # ID_sample: 要抽取的ID | |
# | |
# library(tools) | |
# library(xlsx) | |
# | |
# jgc <- function(){ | |
# .jcall("java/lang/System", method = "gc") | |
# } | |
# | |
# jgc() | |
# | |
# # 防呆 | |
# if(anyDuplicated(ID_sample)) { | |
# stop(c("抽取之ID有重複: ", "\n", | |
# paste0(ID_sample[duplicated(ID_sample)], | |
# collapse = "\n"))) | |
# } | |
# | |
# # .data排除重複id | |
# .data <- .data %>% distinct_(ID_var_name) | |
# | |
# tempData <- .data[.data[[ID_var_name]] %in% ID_sample,] | |
# tempData_backup <- .data[!(.data[[ID_var_name]] %in% ID_sample),] | |
# | |
# outputfileName <- paste(file_path_sans_ext(fileName), | |
# "抽樣後raw_data.xlsx",sep = "_") | |
# write.xlsx(x = tempData, file = outputfileName, | |
# sheetName = paste("N=", length(unique(ID_sample)),sep=""), | |
# row.names = FALSE, showNA=FALSE) | |
# write.xlsx(x = tempData_backup, file = outputfileName, append=TRUE, | |
# sheetName = "backup", row.names = FALSE, showNA=FALSE) | |
# } | |
write.output.csv <- function(ID_sample, .data, ID_var_name){ | |
# ID_var_name: ID所在變數名 | |
# ID_sample: 要抽取的ID | |
library(tools) | |
# 防呆 | |
if(anyDuplicated(ID_sample)) { | |
stop(c("抽取之ID有重複: ", "\n", | |
paste0(ID_sample[duplicated(ID_sample)], | |
collapse = "\n"))) | |
} | |
# .data排除重複id | |
.data <- .data %>% distinct_(ID_var_name) | |
tempData <- .data[.data[[ID_var_name]] %in% ID_sample,] | |
tempData_backup <- .data[!(.data[[ID_var_name]] %in% ID_sample),] | |
outputfileName <- paste(file_path_sans_ext(fileName), | |
"抽樣後raw_data.csv",sep = "_") | |
write.csv(x = tempData, file = outputfileName, | |
quote = T, na = "", row.names=F) | |
write.csv(x = tempData_backup, | |
file = paste0(file_path_sans_ext(fileName), "_backup.csv"), | |
quote = T, na = "", row.names=F) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment