Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active December 18, 2015 06:39
Show Gist options
  • Save benmarwick/5741408 to your computer and use it in GitHub Desktop.
Save benmarwick/5741408 to your computer and use it in GitHub Desktop.
Comparing the number of CSV files and the number of rows and unique IDs in the citations.CSV in DFR archives.
# Names of the DFR archive folders to audit. The loop below reads a
# citations.CSV file and lists a wordcounts subfolder inside each one.
dfr <- c("2013.6.4.usytW8LZ",
         "2013.1.1.E84E4jrp",
         "2012.12.18.98btEyuG",
         "2013.4.18.N45CdU3m",
         "2013.4.20.FxFmBVYd",
         "2013.4.29.jWU8ZEvg",
         "2013.5.8.g6VTxDs5",
         "2013.5.15.JkRSEQce")
# folder that holds the unpacked archives (loop-invariant, so set once)
base <- "C:\\Users\\marwick\\Downloads\\"
# allocate list to hold results of loop...
out <- vector("list", length = length(dfr))
# loop to investigate numbers of files and rows for these archives...
# seq_along() rather than 1:length() so an empty dfr runs zero iterations.
for (i in seq_along(dfr)) {
  # keep track of what we're up to
  print(i)
  # build explicit paths instead of calling setwd(), so the session's
  # working directory is left untouched
  path <- paste0(base, dfr[i])
  csv_file <- file.path(path, "citations.CSV")
  wordcount_dir <- file.path(path, "wordcounts")
  # list.files() silently returns character(0) for a missing folder, so
  # fail loudly here (the previous setwd() call would have errored too)
  if (!isTRUE(file.info(wordcount_dir)$isdir)) {
    stop("wordcounts folder not found: ", wordcount_dir, call. = FALSE)
  }
  # peek at the first 50 rows only to learn how many columns there are
  headset <- read.csv(csv_file, header = TRUE, nrows = 50)
  # every column is read as character
  headclasses <- rep("character", ncol(headset))
  # now read in file
  # NOTE(review): the recorded run of this loop emitted "EOF within quoted
  # string" warnings, and the later re-count script passes quote = "";
  # confirm whether the same option should be used here.
  cit <- read.csv(csv_file, row.names = NULL, comment.char = "",
                  header = TRUE, stringsAsFactors = FALSE,
                  colClasses = headclasses)
  all_in_csv <- length(cit$id)
  unq_in_csv <- length(unique(cit$id))
  # mark duplicates and keep them in the list
  # uni <- make.unique(cit$id)
  # length(uni)
  # just read in 1st column: "character" for the first column and "NULL"
  # (skip) for each of the remaining length(headclasses) - 1 columns
  cit1col <- read.csv(csv_file, row.names = NULL, comment.char = "",
                      header = TRUE, stringsAsFactors = FALSE,
                      colClasses = c("character",
                                     rep("NULL", length(headclasses) - 1L)))
  firstcol <- nrow(cit1col)
  # cit1col is a data frame, so pull the column out with [[1]] before
  # counting; length(unique(<data.frame>)) would count columns instead.
  # Not added to 'out', which keeps its five-column layout.
  uni_in_firstcol <- length(unique(cit1col[[1]]))
  # list the wordcounts folder once and reuse the listing for both counts
  wordcount_files <- list.files(wordcount_dir)
  all_in_dir <- length(wordcount_files)
  unq_in_dir <- length(unique(wordcount_files))
  out[[i]] <- cbind(all_in_csv, unq_in_csv, firstcol, all_in_dir, unq_in_dir)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
Warning messages:
1: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
EOF within quoted string
2: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
EOF within quoted string
3: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
EOF within quoted string
4: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
EOF within quoted string
> out
[[1]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 52526 42672 52526 53808 53808
[[2]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 3877 3877 3877 3877 3877
[[3]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 9000 9000 9000 9000 9000
[[4]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 220 220 220 220 220
[[5]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 3928 3928 3928 3928 3928
[[6]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 986 986 986 986 986
[[7]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 77295 77295 77295 150000 150000
[[8]]
all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
[1,] 56952 50201 56952 80685 80685
# a few problems...
# 1. Number of CSV files is greater than number of items in citations.CSV file
# 2. DOIs in the citations.CSV file have many duplicates
# 1 Jul 13
# Re-count rows and unique ids in citations.CSV for three of the archives,
# this time with quote handling switched off (quote = "") when reading.
dfr <- c("2013.6.4.usytW8LZ",
         "2013.5.8.g6VTxDs5",
         "2013.5.15.JkRSEQce")
# one slot per archive: the parsed table, its row count, its unique-id count
cit <- vector("list", length = length(dfr))
all_in_csv <- vector("integer", length = length(dfr))
unq_in_csv <- vector("integer", length = length(dfr))
# seq_along() rather than 1:length() so an empty dfr runs zero iterations
for (i in seq_along(dfr)) {
  # build the full path instead of calling setwd(), so the session's
  # working directory is left untouched
  csv_file <- file.path(paste0("C:\\Users\\marwick\\Downloads\\", dfr[i]),
                        "citations.CSV")
  # encoding must be spelled "UTF-8": read.csv() documents only "latin1" and
  # "UTF-8" as recognised values, so the lower-case "utf-8" had no effect.
  # quote = "" makes quote characters ordinary data.
  cit[[i]] <- read.csv(csv_file, row.names = NULL, comment.char = "",
                       header = TRUE, stringsAsFactors = FALSE,
                       colClasses = "character", encoding = "UTF-8",
                       quote = "")
  all_in_csv[i] <- length(cit[[i]]$id)
  unq_in_csv[i] <- length(unique(cit[[i]]$id))
}
# two-row summary: total ids on top, unique ids below, one column per archive
rbind(all_in_csv, unq_in_csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment