benmarwick · December 18, 2015 06:39
diff --git a/DFR-compare-files-citations.R b/DFR-compare-files-citations.R
 # Compare, for each downloaded archive, the number of rows / unique ids
 # in its citations.CSV with the number of files in its wordcounts folder.

 # Archive identifiers; each one names a folder under `base`.
 dfr <- c("2013.6.4.usytW8LZ",
          "2013.1.1.E84E4jrp",
          "2012.12.18.98btEyuG",
          "2013.4.18.N45CdU3m",
          "2013.4.20.FxFmBVYd",
          "2013.4.29.jWU8ZEvg",
          "2013.5.8.g6VTxDs5",
          "2013.5.15.JkRSEQce")

 # folder that holds the unpacked archives
 base <- "C:\\Users\\marwick\\Downloads\\"

 # allocate list to hold results of loop...
 out <- vector("list", length = length(dfr))

 # loop to investigate numbers of files and rows for these archives...
 # (seq_along() so that an empty `dfr` runs zero iterations, not two)
 for (i in seq_along(dfr)) {
   # keep track of what we're up to
   print(i)

   # Paths are built explicitly rather than via setwd(), so the loop does
   # not leave the session's working directory changed.
   path <- paste0(base, dfr[i])
   cit_file <- file.path(path, "citations.CSV")

   # Peek at the first rows only to learn how many columns there are;
   # every column is then read as character.
   headset <- read.csv(cit_file, header = TRUE, nrows = 50)
   # headclasses[grep("factor", headclasses)] = "character"
   headclasses <- rep("character", ncol(headset))

   # now read in file
   # NOTE(review): the recorded run of this loop emitted "EOF within quoted
   # string" warnings; reading with quote = "" may be needed -- TODO confirm.
   cit <- read.csv(cit_file, row.names = NULL, comment.char = "",
                   header = TRUE, stringsAsFactors = FALSE,
                   colClasses = headclasses)

   all_in_csv <- length(cit$id)
   unq_in_csv <- length(unique(cit$id))

   # mark duplicates and keep them in the list
   # uni <- make.unique(cit$id)
   # length(uni)

   # just read in 1st column ("NULL" column classes skip the others)
   cit1col <- read.csv(cit_file, row.names = NULL, comment.char = "",
                       header = TRUE, stringsAsFactors = FALSE,
                       colClasses = c("character",
                                      rep("NULL", length(headclasses))))
   firstcol <- nrow(cit1col)
   # unique() on a data frame returns a data frame, whose length() is its
   # number of columns; extract the column to count distinct values.
   uni_in_firstcol <- length(unique(cit1col[[1]]))

   # list the wordcounts folder once and reuse the listing for both counts
   wordcount_files <- list.files(paste0(path, "\\wordcounts"))
   all_in_dir <- length(wordcount_files)
   unq_in_dir <- length(unique(wordcount_files))

   # one-row matrix of counts for this archive
   out[[i]] <- cbind(all_in_csv, unq_in_csv, firstcol, all_in_dir, unq_in_dir)
 }

 [1] 1
 [1] 2
 [1] 3
 [1] 4
 [1] 5
 [1] 6
 [1] 7
 [1] 8
 Warning messages:
 1: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  :
  EOF within quoted string
 2: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  :
  EOF within quoted string
 3: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  :
  EOF within quoted string
 4: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  :
  EOF within quoted string
 > out
 [[1]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]      52526      42672    52526      53808      53808

 [[2]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]       3877       3877     3877       3877       3877

 [[3]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]       9000       9000     9000       9000       9000

 [[4]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]        220        220      220        220        220

 [[5]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]       3928       3928     3928       3928       3928

 [[6]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]        986        986      986        986        986

 [[7]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]      77295      77295    77295     150000     150000

 [[8]]
     all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
 [1,]      56952      50201    56952      80685      80685


 # a few problems...
 # 1. For some archives, the number of CSV files in the wordcounts directory is greater than the number of rows in the citations.CSV file
 # 2. The DOIs in the citations.CSV file have many duplicates in some archives



 # 1 Jul 13
 # Re-read citations.CSV for three of the archives with quote handling
 # switched off (quote = ""), then count total and unique ids in each.
 dfr <- c("2013.6.4.usytW8LZ",
          "2013.5.8.g6VTxDs5",
          "2013.5.15.JkRSEQce")

 # one slot per archive: the citations table and its two counts
 cit <- vector("list", length = length(dfr))
 all_in_csv <- vector("integer", length = length(dfr))
 unq_in_csv <- vector("integer", length = length(dfr))

 # seq_along() so that an empty `dfr` runs zero iterations
 for (i in seq_along(dfr)) {
   # full path to the file, so the working directory is left untouched
   cit_file <- file.path(paste0("C:\\Users\\marwick\\Downloads\\", dfr[i]),
                         "citations.CSV")
   # `encoding` is documented as "unknown", "latin1" or "UTF-8"; the
   # previous "utf-8" is not among those values.
   cit[[i]] <- read.csv(cit_file, row.names = NULL, comment.char = "",
                        header = TRUE, stringsAsFactors = FALSE,
                        colClasses = "character", encoding = "UTF-8",
                        quote = "")
   all_in_csv[i] <- length(cit[[i]]$id)
   unq_in_csv[i] <- length(unique(cit[[i]]$id))
 }

 # two-row summary: total ids over unique ids, one column per archive
 rbind(all_in_csv, unq_in_csv)
	# Compare, for each downloaded archive, the number of rows / unique ids
	# in its citations.CSV with the number of files in its wordcounts folder.

	# Archive identifiers; each one names a folder under `base`.
	dfr <- c("2013.6.4.usytW8LZ",
	         "2013.1.1.E84E4jrp",
	         "2012.12.18.98btEyuG",
	         "2013.4.18.N45CdU3m",
	         "2013.4.20.FxFmBVYd",
	         "2013.4.29.jWU8ZEvg",
	         "2013.5.8.g6VTxDs5",
	         "2013.5.15.JkRSEQce")

	# folder that holds the unpacked archives
	base <- "C:\\Users\\marwick\\Downloads\\"

	# allocate list to hold results of loop...
	out <- vector("list", length = length(dfr))

	# loop to investigate numbers of files and rows for these archives...
	# (seq_along() so that an empty `dfr` runs zero iterations, not two)
	for (i in seq_along(dfr)) {
	  # keep track of what we're up to
	  print(i)

	  # Paths are built explicitly rather than via setwd(), so the loop does
	  # not leave the session's working directory changed.
	  path <- paste0(base, dfr[i])
	  cit_file <- file.path(path, "citations.CSV")

	  # Peek at the first rows only to learn how many columns there are;
	  # every column is then read as character.
	  headset <- read.csv(cit_file, header = TRUE, nrows = 50)
	  # headclasses[grep("factor", headclasses)] = "character"
	  headclasses <- rep("character", ncol(headset))

	  # now read in file
	  # NOTE(review): the recorded run of this loop emitted "EOF within quoted
	  # string" warnings; reading with quote = "" may be needed -- TODO confirm.
	  cit <- read.csv(cit_file, row.names = NULL, comment.char = "",
	                  header = TRUE, stringsAsFactors = FALSE,
	                  colClasses = headclasses)

	  all_in_csv <- length(cit$id)
	  unq_in_csv <- length(unique(cit$id))

	  # mark duplicates and keep them in the list
	  # uni <- make.unique(cit$id)
	  # length(uni)

	  # just read in 1st column ("NULL" column classes skip the others)
	  cit1col <- read.csv(cit_file, row.names = NULL, comment.char = "",
	                      header = TRUE, stringsAsFactors = FALSE,
	                      colClasses = c("character",
	                                     rep("NULL", length(headclasses))))
	  firstcol <- nrow(cit1col)
	  # unique() on a data frame returns a data frame, whose length() is its
	  # number of columns; extract the column to count distinct values.
	  uni_in_firstcol <- length(unique(cit1col[[1]]))

	  # list the wordcounts folder once and reuse the listing for both counts
	  wordcount_files <- list.files(paste0(path, "\\wordcounts"))
	  all_in_dir <- length(wordcount_files)
	  unq_in_dir <- length(unique(wordcount_files))

	  # one-row matrix of counts for this archive
	  out[[i]] <- cbind(all_in_csv, unq_in_csv, firstcol, all_in_dir, unq_in_dir)
	}

	[1] 1
	[1] 2
	[1] 3
	[1] 4
	[1] 5
	[1] 6
	[1] 7
	[1] 8
	Warning messages:
	1: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
	EOF within quoted string
	2: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
	EOF within quoted string
	3: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
	EOF within quoted string
	4: In scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings, :
	EOF within quoted string
	> out
	[[1]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 52526 42672 52526 53808 53808

	[[2]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 3877 3877 3877 3877 3877

	[[3]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 9000 9000 9000 9000 9000

	[[4]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 220 220 220 220 220

	[[5]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 3928 3928 3928 3928 3928

	[[6]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 986 986 986 986 986

	[[7]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 77295 77295 77295 150000 150000

	[[8]]
	all_in_csv unq_in_csv firstcol all_in_dir unq_in_dir
	[1,] 56952 50201 56952 80685 80685


	# a few problems...
	# 1. For some archives, the number of CSV files in the wordcounts directory is greater than the number of rows in the citations.CSV file
	# 2. The DOIs in the citations.CSV file have many duplicates in some archives



	# 1 Jul 13
	# Re-read citations.CSV for three of the archives with quote handling
	# switched off (quote = ""), then count total and unique ids in each.
	dfr <- c("2013.6.4.usytW8LZ",
	         "2013.5.8.g6VTxDs5",
	         "2013.5.15.JkRSEQce")

	# one slot per archive: the citations table and its two counts
	cit <- vector("list", length = length(dfr))
	all_in_csv <- vector("integer", length = length(dfr))
	unq_in_csv <- vector("integer", length = length(dfr))

	# seq_along() so that an empty `dfr` runs zero iterations
	for (i in seq_along(dfr)) {
	  # full path to the file, so the working directory is left untouched
	  cit_file <- file.path(paste0("C:\\Users\\marwick\\Downloads\\", dfr[i]),
	                        "citations.CSV")
	  # `encoding` is documented as "unknown", "latin1" or "UTF-8"; the
	  # previous "utf-8" is not among those values.
	  cit[[i]] <- read.csv(cit_file, row.names = NULL, comment.char = "",
	                       header = TRUE, stringsAsFactors = FALSE,
	                       colClasses = "character", encoding = "UTF-8",
	                       quote = "")
	  all_in_csv[i] <- length(cit[[i]]$id)
	  unq_in_csv[i] <- length(unique(cit[[i]]$id))
	}

	# two-row summary: total ids over unique ids, one column per archive
	rbind(all_in_csv, unq_in_csv)