Daniel Marcelino dmarcelinobr

💭

I may be slow to respond

Data-driven journalist at @JOTAJornalismo who loves algos, predictive analytics and computational journalism. I do things in R/Python/SQL

dmarcelinobr / dedupe_records_with_less_info.r

Created December 30, 2015 20:03

Scripted example in R of removing records that have duplicate IDs but are missing other info

	# Column positions of the name/contact fields. Each of these fields has
	# been flagged with 1 or 0 according to whether the field holds anything.
	# Columns 5 through 26 are covered, with column 22 left out.
	bio_cols <- as.numeric(c(5:21, 23:26))

	# Row numbers of every record whose ID repeats one seen earlier in the data
	dupe_id_rows <- which(duplicated(big.dataset$ID))

dmarcelinobr / crosstabs with tidyr and dplyr

Created December 30, 2015 17:14

crosstabs with tidyr and dplyr

	# Make sure pacman is available, installing it on first use, then let it
	# load (and install if needed) the packages this example relies on.
	if (!require("pacman")) {
	  install.packages("pacman")
	}
	pacman::p_load(dplyr, tidyr, wakefield)

	# Simulated data set of 10,000 rows with race, age and sex columns;
	# the seed is fixed so the same data is generated on every run.
	set.seed(10)
	dat <- r_data_frame(n = 10000, race, age, sex)

dmarcelinobr / pander.CrossTable.R

Created December 30, 2015 16:58

Test 2: Extend pander to also work with CrossTable R objects




	pander.CrossTable <- function(x, ...) {
	pandoc.table(x$t, caption = 'Observed Count')
	pandoc.table(x$prop.row, caption = 'Row percentages')
	pandoc.table(x$prop.col, caption = 'Column percentages')
	emphasize.strong.cells (which(x$chisq$residuals > 2,arr.ind=T))
	emphasize.strong.cells (which(x$chisq$residuals < -2,arr.ind=T))
	pandoc.table(x$chisq$residuals, caption = 'Residuals of the Chi-sqaured test')

dmarcelinobr / coalesce.R

Created December 30, 2015 16:36

Coalesce.R: A coalesce function for R (returns first non-NA value from a list of vectors)

	coalesce<-function(...) {
	x<-lapply(list(...), function(z) {if (is.factor(z)) as.character(z) else z})
	m<-is.na(x[[1]])
	i<-2
	while(any(m) & i<=length(x)) {
	if ( length(x[[i]])==length(x[[1]])) {
	x[[1]][m]<-x[[i]][m]
	} else if (length(x[[i]])==1) {
	x[[1]][m]<-x[[i]]
	} else {

dmarcelinobr / dplyr_vs_data.table_1.8.11.R

Created December 30, 2015 16:30

Benchmarking dplyr and data.table 1.8.11

	# Benchmark written against data.table version 1.8.11 (commit 1048).
	# library() is used rather than require(): require() only warns and returns
	# FALSE when the package is missing, so the script would carry on and fail
	# later with a less obvious error. library() stops right here instead.
	library(data.table)
	# Startup message printed on attach:
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	set.seed(1) # fix the RNG seed so runs are repeatable
	N <- 2e7 # size of DT

dmarcelinobr / sample0110b.R

Created December 30, 2015 16:28

blind sample implementation variation

	# http://stackoverflow.com/a/30781090/2725969


	sample0110b <- function(size, n) {
	size <- as.integer(size)
	n <- as.integer(n)
	if(size > 25 \|\| size < 3L) stop("Size out of valid range")

	# Generate integer pool and weights

dmarcelinobr / regcapturedmatches.R

Created December 30, 2015 16:24

regcapturedmatches.R: extracts captured matches from match data obtained by regexpr, gregexpr or regexec

	regcapturedmatches<-function(x,m) {

	if (length(x) != length(m))
	stop(gettextf("%s and %s must have the same length",
	sQuote("x"), sQuote("m")), domain = NA)

	ili <- is.list(m)
	useBytes <- if (ili) {
	any(unlist(lapply(m, attr, "useBytes")))
	} else {

dmarcelinobr / Install_R.R

Created December 30, 2015 15:50

R script to install commonly used R packages, plus instructions for using OpenBLAS on OS X via Homebrew

	##############################################################################
	# title : InstallPackages.R;
	# purpose : install R packages commonly used by Adam H.Sparks when upgrading or installing R;
	# producer : prepared by A. H. Sparks;
	# last update : in Los Baños, Laguna, PHL, May 2015;
	# inputs : none;
	# outputs : none;
	# remarks 1 : in order to download any packages, you need to be on-line, of course;
	# remarks 2 : for country outlines and the like see http://www.gadm.org/ to download Rdata packages;
	##############################################################################

dmarcelinobr / fingerprint.r

Created December 30, 2015 15:40

Fingerprint two strings

	fingerprint <- function(x, y) {
	if (!inherits(x, "character") \| !inherits(y, "character")) {
	stop("x and y must be character strings")
	}

	x1 <- strsplit(x, "")[[1]]
	y1 <- strsplit(y, "")[[1]]
	f <- c(x1, y1)
	final <- paste(f[order(f)], collapse = "")
	return(final)

dmarcelinobr / distributed.data.table.R

Last active December 30, 2015 15:25

Distributed processing using data.table

	library(Rserve)
	library(data.table)

	# Four ports are set aside for the cluster, so at most four nodes can run.
	port <- 9411:9414

	# Simulated input data; the partitions are derived dynamically from its
	# year column.
	dt <- data.table(time_year = 2012:2014)
	partitioning <- quote(time_year)

	# Every distinct value of the partitioning column in the reference dataset,
	# with each value also used as its own name.
	partitions <- unique(dt[[as.character(partitioning)]])
	partitions <- setNames(partitions, partitions)

	# Assign one port per partition, in order, labelled by the partition value.
	port <- setNames(port[seq_along(partitions)], as.character(partitions))