Last active
August 30, 2018 23:50
-
-
Save derekpowell/9c8e346d15c3ee4a61e4a40464eeae86 to your computer and use it in GitHub Desktop.
Redact qualtrics workerId and IPAddress from data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Derek Powell | |
# Date: 8/30/18, 4:49 PM | |
# --- | |
# Script to redact workerIds and ip addresses from qualtrics files. | |
# Script looks for "*_private/" directories, saves resulting data in a sibling "data" directory.
# Personal info is replaced with a "hash" using xxhash64, | |
# a super fast hash algo w/ short resulting hashes (confirmed appropriate for this use) | |
suppressMessages(library(tidyverse)) | |
suppressMessages(library(digest)) | |
# Root of the recursive search: the directory the script is launched from.
currentDir <- getwd()
# Append the fixed salt string to every element of `x`.
#
# x: vector of values to salt (coerced to character by concatenation).
# Returns a character vector of salted strings.
make_salted <- function(x) {
  salt <- "mySuperSecretSalt" # arbitrary "salt" (optional)
  paste0(x, salt)
}
# Replace each element of `x` with the xxhash64 digest of its salted value.
#
# x: vector of identifiers to redact (e.g. workerId or IPAddress values).
# Returns an unnamed character vector with one hash per element of `x`.
make_hash <- function(x) {
  # An empty column must stay empty: paste0() inside make_salted() recycles a
  # zero-length input to "", which would yield one salt-only hash.
  if (length(x) == 0) {
    return(character(0))
  }
  salted <- make_salted(x)
  # vapply() pins the result to a character vector. USE.NAMES = FALSE stops
  # the salted raw identifiers from being attached as element names, which
  # sapply() does for character input.
  vapply(salted, digest, character(1), algo = "xxhash64", USE.NAMES = FALSE)
}
# Return the portion of `dirString` that precedes its last "/".
#
# dirString: a path using "/" as separator.
# Returns the parent path as a string; the result is "" when `dirString`
# contains no "/" or when its only "/" is the first character.
parent_directory <- function(dirString) {
  slash_positions <- gregexpr("/", dirString)[[1]]
  last_slash <- as.numeric(slash_positions[length(slash_positions)])
  substr(dirString, 1, last_slash - 1)
}
# privateDirs <- paste0(currentDir,"/Data_private/") | |
# Find every "*_private" directory beneath the working directory and write a
# redacted copy of each CSV it contains into a sibling "data" directory.
# `pattern` is a regular expression (not a glob), so anchor on the suffix.
privateDirList <- list.files(
  path = currentDir, pattern = "_private$", recursive = TRUE,
  include.dirs = TRUE
)
# list.files() also returns plain files whose names match; keep directories.
isPrivateDir <- dir.exists(file.path(currentDir, privateDirList))
privateDirList <- privateDirList[isPrivateDir]

# Identifier columns are replaced by a salted hash; coordinate columns are
# coarsened to two decimal places.
hashedColumns <- c("workerId", "IPAddress")
roundedColumns <- c("LocationLatitude", "LocationLongitude")

for (d in privateDirList) {
  privateDir <- file.path(currentDir, d)
  workingDir <- parent_directory(privateDir)
  outDir <- file.path(workingDir, "data")
  # Match the ".csv" extension only; the glob "*.csv" read as a regex would
  # also match names such as "notcsv.txt".
  fileList <- list.files(privateDir, pattern = "\\.csv$")
  for (f in fileList) {
    df <- read.csv(file.path(privateDir, f), stringsAsFactors = FALSE)

    for (column in hashedColumns) {
      if (column %in% colnames(df)) {
        df <- df %>% mutate(!!column := make_hash(.data[[column]]))
        print(paste(f, "---", column, "variable redacted"))
      } else {
        print(paste(f, "--- no", column, "variable present"))
      }
    }

    for (column in roundedColumns) {
      if (column %in% colnames(df)) {
        df <- df %>%
          mutate(!!column := round(as.numeric(.data[[column]]), 2))
        print(paste(f, "---", column, "variable rounded"))
      } else {
        print(paste(f, "--- no", column, "variable present"))
      }
    }

    # Test for the exact directory that is about to be written to. The old
    # check looked for "Data" but created "data", so on case-sensitive file
    # systems it either warned on every file or skipped creation entirely.
    if (!dir.exists(outDir)) {
      dir.create(outDir)
    }
    write.csv(df, file = file.path(outDir, f))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Script to Anonymize Data
This is a script for anonymizing data inside a project before pushing to GitHub. The script
redact.R
searches recursively for any folders matching `/*_private`, takes any .csv files it finds in those folders, and hashes the "workerId" and "IPAddress" data fields. The resulting anonymized data is then saved in a `/data` folder at the same level of the directory structure as the corresponding `*_private` folder. If the `/data` folder doesn't exist at that level of the hierarchy, it is created. The script assumes those variable names based on the default Qualtrics .csv output.
A .gitignore file ensures any private folders and the
redact.R
are not committed to git and won't make their way to GitHub. Here's a sample directory structure before running
redact.R
:And here's that same structure after running
redact.R
:Finally, here's what you need to have in the .gitignore: