Skip to content

Instantly share code, notes, and snippets.

View dmarcelinobr's full-sized avatar
💭
I may be slow to respond

Daniel Marcelino dmarcelinobr

💭
I may be slow to respond
View GitHub Profile
@dmarcelinobr
dmarcelinobr / dedupe_records_with_less_info.r
Created December 30, 2015 20:03
Scripted example in R of removing records that have duplicate IDs but are missing other info
# Column positions of the name/contact fields. Each of these fields has been
# marked with 1s and 0s depending on whether or not there is anything in the
# field. The positions run from 5 to 26 with column 22 left out; they are
# stored as integer indices.
bio_cols <- c(5:21, 23:26)
# Row numbers of all the records whose ID duplicates an earlier record's ID.
# duplicated() already returns a logical vector, so it is passed straight to
# which() without comparing against TRUE.
dupe_id_rows <- which(duplicated(big.dataset$ID))
@dmarcelinobr
dmarcelinobr / crosstabs with tidyr and dplyr
Created December 30, 2015 17:14
crosstabs with tidyr and dplyr
# Install pacman when it cannot be loaded, then use it to load the packages
# this example needs.
if (!require("pacman")) {
  install.packages("pacman")
}
pacman::p_load(dplyr, tidyr, wakefield)

# Fix the random seed so the generated data is reproducible.
set.seed(10)

# Build the example data with r_data_frame(): n = 10000 and the race, age and
# sex variables.
dat <- r_data_frame(n = 10000, race, age, sex)
@dmarcelinobr
dmarcelinobr / pander.CrossTable.R
Created December 30, 2015 16:58
Test 2: Extend pander to also work with CrossTable R objects
pander.CrossTable <- function(x, ...) {
pandoc.table(x$t, caption = 'Observed Count')
pandoc.table(x$prop.row, caption = 'Row percentages')
pandoc.table(x$prop.col, caption = 'Column percentages')
emphasize.strong.cells (which(x$chisq$residuals > 2,arr.ind=T))
emphasize.strong.cells (which(x$chisq$residuals < -2,arr.ind=T))
pandoc.table(x$chisq$residuals, caption = 'Residuals of the Chi-sqaured test')
@dmarcelinobr
dmarcelinobr / coalesce.R
Created December 30, 2015 16:36
Coalesce.R: A coalesce function for R (returns first non-NA value from a list of vectors)
coalesce<-function(...) {
x<-lapply(list(...), function(z) {if (is.factor(z)) as.character(z) else z})
m<-is.na(x[[1]])
i<-2
while(any(m) & i<=length(x)) {
if ( length(x[[i]])==length(x[[1]])) {
x[[1]][m]<-x[[i]][m]
} else if (length(x[[i]])==1) {
x[[1]][m]<-x[[i]]
} else {
@dmarcelinobr
dmarcelinobr / dplyr_vs_data.table_1.8.11.R
Created December 30, 2015 16:30
Benchmarking dplyr and data.table 1.8.11
# version 1.8.11 (commit 1048)
# library() stops with an error when data.table is not installed, whereas
# require() only warns and returns FALSE, which would let the benchmark fail
# later at the first data.table call.
library(data.table)
# data.table 1.8.11 For help type: help("data.table")
## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # size of DT
@dmarcelinobr
dmarcelinobr / sample0110b.R
Created December 30, 2015 16:28
blind sample implementation variation
# http://stackoverflow.com/a/30781090/2725969
sample0110b <- function(size, n) {
size <- as.integer(size)
n <- as.integer(n)
if(size > 25 || size < 3L) stop("Size out of valid range")
# Generate integer pool and weights
@dmarcelinobr
dmarcelinobr / regcapturedmatches.R
Created December 30, 2015 16:24
regcapturedmatches.R: extracts captured matches from match data obtained by regexpr, gregexpr or regexec
regcapturedmatches<-function(x,m) {
if (length(x) != length(m))
stop(gettextf("%s and %s must have the same length",
sQuote("x"), sQuote("m")), domain = NA)
ili <- is.list(m)
useBytes <- if (ili) {
any(unlist(lapply(m, attr, "useBytes")))
} else {
@dmarcelinobr
dmarcelinobr / Install_R.R
Created December 30, 2015 15:50
R script and instructions to install commonly used R packages and instructions for using OpenBLAS on OS X via Homebrew
##############################################################################
# title : InstallPackages.R;
# purpose : install R packages commonly used by Adam H.Sparks when upgrading or installing R;
# producer : prepared by A. H. Sparks;
# last update : in Los Baños, Laguna, PHL, May 2015;
# inputs : none;
# outputs : none;
# remarks 1 : in order to download any packages, you need to be on-line, of course;
# remarks 2 : for country outlines and the like see http://www.gadm.org/ to download Rdata packages;
##############################################################################
@dmarcelinobr
dmarcelinobr / fingerprint.r
Created December 30, 2015 15:40
Fingerprint two strings
fingerprint <- function(x, y) {
if (!inherits(x, "character") | !inherits(y, "character")) {
stop("x and y must be character strings")
}
x1 <- strsplit(x, "")[[1]]
y1 <- strsplit(y, "")[[1]]
f <- c(x1, y1)
final <- paste(f[order(f)], collapse = "")
return(final)
@dmarcelinobr
dmarcelinobr / distributed.data.table.R
Last active December 30, 2015 15:25
Distributed processing using data.table
library(Rserve)
library(data.table)
# Four ports are set aside for the cluster, so it can have up to 4 nodes.
port <- 9411:9414
# Simulated input data; the partitions are derived dynamically from its years.
dt <- data.table(time_year = 2012:2014)
# Unevaluated name of the column that defines the partitions.
partitioning <- quote(time_year)
# All distinct values of the partitioning column in the reference dataset.
partitions <- dt[, unique(eval(as.name(partitioning)))]
names(partitions) <- partitions
# Keep one port per partition and label each port with its partition value.
port <- port[seq_along(partitions)]
names(port) <- as.character(partitions)