Created
May 4, 2012 01:42
-
-
Save inkhorn/2591152 to your computer and use it in GitHub Desktop.
Scripted example in R of removing records that have duplicate IDs but are missing other info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Deduplicate big.dataset by ID: among records sharing an ID, drop the
# copies with fewer name/contact fields filled in, then drop exact dupes.
#
# Column numbers of name/contact fields, each pre-coded as 1 or 0
# depending on whether the field contains any information.
# (Column 22 is deliberately excluded.)
bio_cols <- c(5:21, 23:26)

# Row numbers of every record whose ID duplicates an earlier record
dupe_id_rows <- which(duplicated(big.dataset$ID))

# The actual ID values that were duplicated
dupe_ids <- big.dataset$ID[dupe_id_rows]

# For each duplicate record, the count of bio fields with info in them.
# rowSums() is the vectorized equivalent of apply(..., 1, sum) and does
# not coerce the data.frame through a matrix.
# NOTE: the original referenced undefined `dupe_conids_rows` here — fixed.
info_sums <- rowSums(big.dataset[dupe_id_rows, bio_cols])

# Glue the pieces together so we can isolate, per ID, the duplicate
# rows that carry less information than their siblings, then delete
# those rows from the larger data set.
a <- data.frame(sums = info_sums, ids = dupe_ids, rows = dupe_id_rows)

# Mean number of filled-in fields per ID across its duplicates.
# ave() aligns each group's mean with its rows directly, so the frame
# no longer needs to be pre-sorted by ID (the original's rep/tapply
# trick silently depended on that ordering).
a$avg <- ave(a$sums, a$ids)

# Mark a duplicate for deletion (1) when it has fewer filled-in fields
# than the average for its ID — i.e. it is the info-poor copy.
a$del <- ifelse(a$sums < a$avg, 1, 0)

# Unique row numbers in the bigger dataset that are duplicates missing
# information.
rows_to_del <- a$rows[a$del == 1]

# Delete those rows. Guard the empty case: df[-integer(0), ] returns
# ZERO rows in R, which would silently wipe the whole dataset.
if (length(rows_to_del) > 0) {
  big.dataset <- big.dataset[-rows_to_del, ]
}

# All remaining dupes are exact ties on info count; keep only the first
# occurrence of each ID.
big.dataset <- big.dataset[!duplicated(big.dataset$ID), ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment