Last active
December 18, 2015 06:39
-
-
Save benmarwick/5741408 to your computer and use it in GitHub Desktop.
Comparing the number of CSV files and the number of rows and unique IDs in the citations.CSV in DFR archives.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare, for each DFR archive, the number of rows and unique IDs found in
# citations.CSV with the number of files in the archive's wordcounts directory.
# The result is `out`, a list with one 1 x 5 matrix per archive.
dfr <- c(
  "2013.6.4.usytW8LZ",
  "2013.1.1.E84E4jrp",
  "2012.12.18.98btEyuG",
  "2013.4.18.N45CdU3m",
  "2013.4.20.FxFmBVYd",
  "2013.4.29.jWU8ZEvg",
  "2013.5.8.g6VTxDs5",
  "2013.5.15.JkRSEQce"
)

# folder that holds the downloaded archives (trailing separator included);
# it does not change between iterations, so it is defined once here
base <- "C:\\Users\\marwick\\Downloads\\"

# allocate list to hold results of loop...
out <- vector("list", length = length(dfr))

# loop to investigate numbers of files and rows for these archives...
for (i in seq_along(dfr)) {
  # keep track of what we're up to
  print(i)

  # build absolute paths instead of calling setwd(), so the session's
  # working directory is left untouched by this script
  path <- paste0(base, dfr[i])
  citations_file <- file.path(path, "citations.CSV")
  wordcounts_dir <- file.path(path, "wordcounts")
  # list.files() silently returns nothing for a missing directory, so fail
  # loudly here, as the setwd() calls used to
  stopifnot(dir.exists(path), dir.exists(wordcounts_dir))

  # peek at the first rows only to learn how many columns read.csv() sees,
  # then request that every column be read as character
  headset <- read.csv(citations_file, header = TRUE, nrows = 50)
  headclasses <- rep("character", ncol(headset))

  # now read in file
  # NOTE(review): a recorded run of this loop produced "EOF within quoted
  # string" warnings from scan(), so some counts may be incomplete - confirm
  # how quotes in citations.CSV should be handled.
  cit <- read.csv(
    citations_file,
    row.names = NULL, comment.char = "", header = TRUE,
    stringsAsFactors = FALSE, colClasses = headclasses
  )
  all_in_csv <- length(cit$id)
  unq_in_csv <- length(unique(cit$id))

  # just read in 1st column (a "NULL" entry in colClasses skips that column)
  cit1col <- read.csv(
    citations_file,
    row.names = NULL, comment.char = "", header = TRUE,
    stringsAsFactors = FALSE,
    colClasses = c("character", rep("NULL", length(headclasses)))
  )
  firstcol <- nrow(cit1col)
  # length(unique(<data frame>)) counts columns (always 1 here), so count the
  # distinct values of the column vector instead. This value is not added to
  # `out`, whose five columns are kept as they were.
  uni_in_firstcol <- length(unique(cit1col[[1]]))

  # list the directory once and reuse the listing for both counts
  wordcount_files <- list.files(wordcounts_dir)
  all_in_dir <- length(wordcount_files)
  unq_in_dir <- length(unique(wordcount_files))

  out[[i]] <- cbind(all_in_csv, unq_in_csv, firstcol, all_in_dir, unq_in_dir)
}
[1] 1 | |
[1] 2 | |
[1] 3 | |
[1] 4 | |
[1] 5 | |
[1] 6 | |
[1] 7 | |
[1] 8 | |
Warning messages: | |
1: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, : | |
EOF within quoted string | |
2: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, : | |
EOF within quoted string | |
3: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, : | |
EOF within quoted string | |
4: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, : | |
EOF within quoted string | |
> out | |
[[1]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 52526 42672 52526 53808 53808 | |
[[2]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 3877 3877 3877 3877 3877 | |
[[3]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 9000 9000 9000 9000 9000 | |
[[4]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 220 220 220 220 220 | |
[[5]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 3928 3928 3928 3928 3928 | |
[[6]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 986 986 986 986 986 | |
[[7]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 77295 77295 77295 150000 150000 | |
[[8]] | |
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir | |
[1,] 56952 50201 56952 80685 80685 | |
# A few problems...
# 1. The number of CSV files is greater than the number of items in the citations.CSV file
# 2. The DOIs in the citations.CSV file include many duplicates
# 1 Jul 13 | |
# Re-check the row and unique-ID counts of citations.CSV for three of the
# archives, this time reading the file with quoting disabled (quote = "").
# Prints a 2 x 3 matrix: total rows and unique IDs per archive.
dfr <- c(
  "2013.6.4.usytW8LZ",
  "2013.5.8.g6VTxDs5",
  "2013.5.15.JkRSEQce"
)

# one data frame per archive, every column read as character; absolute paths
# are used so the working directory is not changed as a side effect
cit <- lapply(dfr, function(archive) {
  archive_dir <- paste0("C:\\Users\\marwick\\Downloads\\", archive)
  read.csv(
    file.path(archive_dir, "citations.CSV"),
    row.names = NULL, comment.char = "", header = TRUE,
    stringsAsFactors = FALSE, colClasses = "character",
    # "UTF-8" is the spelling documented for this argument; the lower-case
    # "utf-8" used previously is not a documented value
    encoding = "UTF-8",
    quote = ""
  )
})

# number of rows, and number of distinct IDs, in each archive's citations file
all_in_csv <- vapply(cit, function(d) length(d$id), integer(1))
unq_in_csv <- vapply(cit, function(d) length(unique(d$id)), integer(1))

rbind(all_in_csv, unq_in_csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment