Created
November 22, 2018 20:30
-
-
Save tomfbush/a2972816f4c8a849484c75a4a32f5e18 to your computer and use it in GitHub Desktop.
Clean a specific column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to substitute matching string and everything after it with nothing | |
# note leading space if that applies to your dataset | |
removeCrap <- function(crapThing) { | |
gsub(" (0|[1-9][0-9]*)\ ?(GB).*$", "", crapThing) | |
} | |
# load data from csv into a dataframe | |
d <- read.csv("input.csv", sep = ",") | |
# display a few rows of the dataframe 'd' | |
# note name of column we want to change in this case is Col_1 | |
head(d) | |
# apply the removeCrap() function to each row of the correct column | |
# by referencing dataframe 'd', column 'Col_1' (known together as d$Col_1) | |
# and output to a separate 'factor' called cleanCol | |
cleanCol <- sapply(d$Col_1, FUN = removeCrap) | |
# bind the original data and the clean column | |
cleanData <- cbind(cleanCol, d) | |
# output the data to a new csv file, not using any row numbers | |
# (cos Excel has those already) | |
write.csv(cleanData, "output.csv", row.names = F) | |
# APPENDIX | |
# creating fake data and writing to a file to read in above this | |
# col1 <- c("some nice text 9GB blah blah", "some other text 10 GB nonsense with a space") | |
# col2 <- c(12, 24) | |
# testData <- data.frame(col1, col2) | |
# colnames(testData) <- c("Col_1", "Col_2") | |
# write.csv(testData, "input.csv", row.names = F) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment