tomfbush · November 22, 2018 20:30
diff --git a/clean.r b/clean.r
 # function to substitute matching string and everything after it with nothing
 # note leading space if that applies to your dataset
 removeCrap <- function(crapThing) {
    gsub(" (0|[1-9][0-9]*)\ ?(GB).*$", "", crapThing)
 }

 # load data from csv into a dataframe
 d <- read.csv("input.csv", sep = ",")

 # display a few rows of the dataframe 'd'
 # note name of column we want to change in this case is Col_1
 head(d)

 # apply the removeCrap() function to each row of the correct column
 # by referencing dataframe 'd', column 'Col_1' (known together as d$Col_1)
 # and output to a separate 'factor' called cleanCol
 cleanCol <- sapply(d$Col_1, FUN = removeCrap)

 # bind the original data and the clean column
 cleanData <- cbind(cleanCol, d)

 # output the data to a new csv file, not using any row numbers
 # (cos Excel has those already)
 write.csv(cleanData, "output.csv", row.names = F)

 # APPENDIX
 # creating fake data and writing to a file to read in above this
 # col1 <- c("some nice text 9GB blah blah", "some other text 10 GB nonsense with a space")
 # col2 <- c(12, 24)
 # testData <- data.frame(col1, col2)
 # colnames(testData) <- c("Col_1", "Col_2")
 # write.csv(testData, "input.csv", row.names = F)
	# function to substitute matching string and everything after it with nothing
	# note leading space if that applies to your dataset
	removeCrap <- function(crapThing) {
	gsub(" (0\|[1-9][0-9])\ ?(GB).$", "", crapThing)
	}

	# load data from csv into a dataframe
	d <- read.csv("input.csv", sep = ",")

	# display a few rows of the dataframe 'd'
	# note name of column we want to change in this case is Col_1
	head(d)

	# apply the removeCrap() function to each row of the correct column
	# by referencing dataframe 'd', column 'Col_1' (known together as d$Col_1)
	# and output to a separate 'factor' called cleanCol
	cleanCol <- sapply(d$Col_1, FUN = removeCrap)

	# bind the original data and the clean column
	cleanData <- cbind(cleanCol, d)

	# output the data to a new csv file, not using any row numbers
	# (cos Excel has those already)
	write.csv(cleanData, "output.csv", row.names = F)

	# APPENDIX
	# creating fake data and writing to a file to read in above this
	# col1 <- c("some nice text 9GB blah blah", "some other text 10 GB nonsense with a space")
	# col2 <- c(12, 24)
	# testData <- data.frame(col1, col2)
	# colnames(testData) <- c("Col_1", "Col_2")
	# write.csv(testData, "input.csv", row.names = F)