benmarwick · December 19, 2015 04:28
diff --git a/citation-network-sketch.rmd b/citation-network-sketch.rmd
 Citation Network Analysis
 ========================================================


 1. Go to http://apps.webofknowledge.com.offcampus.lib.washington.edu/
 2. click the 'web of science' tab at the top
 3. Do a search...  refine search by journal... perhaps archaeolog* in 'publication title', hit 'search'
 4. scroll to bottom of search results, where 'Output Records' section is
 5. Select records -> 1-500 since 500 is max 
 6. Select content -> Full Record & Cited References
 7. Select destination -> "save to tab-delimited file (UTF-8)"
 8. Inspect text file with text editor

 Load the data into R...
 ```{r}
 # Read every Web of Science export in the citations folder into one table.
 # `wd` is the folder that contains the tab-delimited .txt files of citations;
 # the final (eval=FALSE) chunk of this document reads `wd` again.
 wd <- "C:/Users/marwick/Downloads/AAcitations"
 setwd(wd)
 # library() stops with an error when data.table is not installed, whereas
 # require() only returns FALSE and lets a later call fail less clearly
 library(data.table)
 # only pick up the .txt exports, so that any other file sitting in the
 # folder is not parsed as if it held citation records
 fl <- list.files(pattern = "\\.txt$", ignore.case = TRUE)
 if (length(fl) == 0) {
   stop("No .txt citation files found in ", wd, call. = FALSE)
 }
 # quote = "" switches off quote handling, check.names = FALSE keeps the
 # header tags exactly as written in the file, and row.names = NULL forces
 # plain row numbering instead of using a data column as row names
 arts_list <- lapply(fl, function(f) {
   data.table(read.delim(f, sep = "\t", stringsAsFactors = FALSE,
                         check.names = FALSE, quote = "", row.names = NULL))
 })
 # stack the per-file data tables into one big data table (columns are
 # matched by name, as rbind() on data tables does)
 arts <- rbindlist(arts_list, use.names = TRUE)
 ```
 Rearrange the table to split out the citations into another table but keep a lookup value so the new table of citations can be linked to the article in the original table.

 First, format citations to try and make them consistent. 
 ```{r}
 library(data.table)
 # Build one label per source article, laid out like a WOS cited reference:
 #   "FIRST AUTHOR YEAR, JOURNAL, Vnnn, Pnnn, DOI xxx"
 # so that source articles and their citations can be compared as text.
 # NOTE(review): the column tags used below (PD for the year, BN for the
 # journal, PY for the volume, MA for the page, AR for the DOI) are taken
 # as-is from the table; presumably the header is shifted by one column when
 # the files are read -- confirm against the data.

 # TRUE where a field is NA or holds nothing but whitespace
 is_blank <- function(x) {
   is.na(x) | !nzchar(trimws(as.character(x)))
 }

 # first author: text before the first ";", periods removed, upper case
 # (an empty author field gives "" instead of a subscript error)
 au_parts <- strsplit(as.character(arts[, `ï»¿PT`]), ";", fixed = TRUE)
 au1 <- toupper(gsub(".", "", vapply(au_parts, function(p) {
   if (length(p) > 0) p[[1]] else ""
 }, character(1)), fixed = TRUE))
 # year (with spaces and comma)
 yr <- paste0(" ", arts[, PD], ",")
 # abbreviated journal title (with spaces and comma)
 bn <- paste0(" ", arts[, BN], ",")
 # VNNN -- left out when the field is missing or empty
 vn <- ifelse(is_blank(arts[, PY]), "", paste0(" V", arts[, PY], ","))
 # PNNN -- left out when the field is missing or empty
 pn <- ifelse(is_blank(arts[, MA]), "", paste0(" P", arts[, MA]))
 # DOI -- left out when the field is missing or empty, so that an empty
 # character field no longer leaves a dangling ", DOI " at the end
 do <- ifelse(is_blank(arts[, AR]), "", paste0(", DOI ", arts[, AR]))
 # put it all together; `ar` stays a list with one element per article
 # because the next chunk indexes it as ar[i]
 ar <- if (nrow(arts) == 0) {
   list()
 } else {
   as.list(paste0(au1, yr, bn, vn, pn, do))
 }
 # the same labels as a plain character vector
 ar1 <- unlist(ar)
 ```
 Second, make the table of source articles and citations, trying to make the text format consistent on both sides.
 ```{r}
 library(data.table)
 # Build a two-column table with one row per (source article, cited reference)
 # pair: `ar` is the source article label, `cits` the reformatted citation.
 cit_list <- vector("list", length = nrow(arts))
 for (i in seq_len(nrow(arts))) {
   # cited references of article i, held as one "; "-separated string
   raw_refs <- as.character(arts[i, FX])
   # control flow in case there are no citations(!): a missing or empty
   # field contributes no rows at all
   if (is.na(raw_refs) || !nzchar(raw_refs)) {
     next
   }
   refs <- unlist(strsplit(raw_refs, "; ", fixed = TRUE))
   refs <- refs[nzchar(refs)]
   # get rid of the comma between author and year (first comma only)
   refs <- sub(",", "", refs, fixed = TRUE)
   # get rid of periods within the author name, which is the piece before
   # the first remaining comma, then glue the pieces back together.
   # collapse = "," yields exactly one string per citation; pasting the
   # trailing pieces with sep = "," would give one string per trailing piece
   # and turn a single citation into several rows.
   pieces <- strsplit(refs, ",", fixed = TRUE)
   c2 <- toupper(vapply(pieces, function(p) {
     p[1] <- gsub(".", "", p[1], fixed = TRUE)
     paste(p, collapse = ",")
   }, character(1)))

   if (length(c2) == 0) {
     next
   }
   # repeat the source label once per citation so both columns line up
   cit_list[[i]] <- data.table(
     ar = toupper(unlist(rep(ar[i], length(c2)))),
     cits = c2
   )
 }
 # put list of dts into one dt (articles without citations are left as NULL
 # entries in the list)
 cits <- rbindlist(cit_list)
 ```
 Now that we have the data in a table, we can create and plot the network
 ```{r}
 # Build the citation graph from the edge table and plot a pruned version.
 # trim both labels to their first 40 characters (lower case) so that they
 # are readable on the plot
 cits1 <- data.table(ar = tolower(substr(cits$ar, 1, 40)),
                     cits = tolower(substr(cits$cits, 1, 40)))
 # library() fails straight away when igraph is missing
 library(igraph)
 # one edge per row of cits1, from the `ar` column to the `cits` column
 g <- graph.edgelist(as.matrix(cits1))
 # identify those vertices part of less than n edges
 n <- 50
 g.vs <- V(g)[degree(g) < n]
 # exclude them from the graph
 g1 <- delete.vertices(g, g.vs)
 # degrees change once vertices are gone, so prune again with a lower
 # threshold (n/5) on the reduced graph
 g1 <- delete.vertices(g1, which(degree(g1) < n/5))
 # make vertex sizes variable (proportional to degree)
 V(g1)$size <- degree(g1)/10
 # make label sizes variable (proportional to degree)
 V(g1)$label.cex <- degree(g1)/10
 # basic plot
 # windowsFonts() is only available on Windows, so the font mapping is
 # registered there and skipped on other platforms instead of stopping
 if (.Platform$OS.type == "windows") {
   windowsFonts(Arial = windowsFont("Arial"))
 }
 plot(g1,
      edge.curved = TRUE,
      vertex.color = "grey",
      edge.arrow.size = 0.01,
      vertex.label.dist = 0.5,
      vertex.label.color = "grey30",
      vertex.label.family = "Arial"
      ) # add vertex.label = NA to hide the labels
 # use a layout: same graph, Fruchterman-Reingold layout, labels hidden
 layout1 <- layout.fruchterman.reingold(g1, niter = 500)
 plot(g1, layout = layout1, edge.curved = TRUE, vertex.color = "grey",
      edge.arrow.size = 0.1, vertex.label.dist = 0.5, vertex.label = NA)


 ```


 ```{r}
 # d3 plot in web browser, using d3SimpleNetwork() from the d3Network package
 # install from GitHub only when the package is not there yet, and use the
 # "username/repo" form of install_github()
 if (!requireNamespace("d3Network", quietly = TRUE)) {
   devtools::install_github("christophergandrud/d3Network")
 }
 library(d3Network)
 # get.data.frame() turns the pruned graph back into an edge table; the
 # interactive plot is written to d3net.html
 d3Network::d3SimpleNetwork(get.data.frame(g1), width = 1500, height = 800,
                textColour = "green", linkColour = "red", fontsize = 10,
                charge = -1000, opacity = 0.9, file = "d3net.html")


 ```


 We can search for an author.
 ```{r}
 # Look up one author in the citation table and in the full graph.
 author <- "DE LEON"
 # rows of the citation table whose source article label contains the author
 sear <- cits[grepl(author, cits$ar), ]
 # subset graph by label: http://lists.gnu.org/archive/html/igraph-help/2009-10/msg00039.html
 # vertex names in g are lower case (see where g is built), so the search
 # term is lower-cased before matching; this matches source articles and
 # cited references alike
 hits <- which(grepl(tolower(author), V(g)$name, fixed = TRUE))
 degree(g, hits)
 # subset: keep only the edges that touch one of the matching vertices
 # NOTE(review): assumes the intended subset is the neighbourhood of the
 # searched author -- confirm
 hit_edges <- unique(unlist(lapply(hits, function(v) {
   as.integer(incident(g, v, mode = "all"))
 })))
 subgraph.edges(g, as.integer(hit_edges))
 ```

 ```{r}
 # Save the pruned network as GraphML, a format that Gephi can open
 graphml_file <- "g.graphml"
 write.graph(g1, file = graphml_file, format = "graphml")
 # The file name above is relative, so show the working directory to find it
 getwd()
 ```


 Background and ideas:

 http://kieranhealy.org/philcites/
 http://kieranhealy.org/blog/archives/2013/06/18/a-co-citation-network-for-philosophy/
 http://nealcaren.web.unc.edu/a-sociology-citation-network/

 Choosing community detection algos:

 http://igraph.wikidot.com/community-detection-in-r
 http://bommaritollc.com/2012/06/17/summary-community-detection-algorithms-igraph-0-6/?utm_source=rss&utm_medium=rss&utm_campaign=summary-community-detection-algorithms-igraph-0-6
 http://stackoverflow.com/a/9478989/1036500


 ```{r, echo=FALSE, message=FALSE, eval=FALSE}
 # This chunk is not evaluated when the document is knitted (eval=FALSE);
 # run it by hand to knit the document and convert the result to PDF.
 # Load packages
 setwd(wd) # assumes wd has been set earlier in the doc
 require(knitr)
 require(markdown)

 # name of this markdown file without suffix
 # NOTE(review): confirm that this matches the actual file name of this document
 filen <- "citation_net"
 rmd_file <- paste0(filen, ".rmd")
 md_file <- paste0(filen, ".md")
 pdf_file <- paste0(filen, ".pdf")
 # process the .rmd file with knit(), then hand the .md file to pandoc
 knit(rmd_file)
 # pandoc makes the PDF via LaTeX (including smart punctuation and grey
 # background of code blocks)
 pandoc_cmd <- paste0("pandoc -s ", md_file, " -t latex -o ", pdf_file, " --highlight-style=tango  -S")
 system(pandoc_cmd)
 ```
	Citation Network Analysis
	========================================================


	1. Go to http://apps.webofknowledge.com.offcampus.lib.washington.edu/
	2. click the 'web of science' tab at the top
	3. Do a search... refine search by journal... perhaps archaeolog* in 'publication title', hit 'search'
	4. scroll to bottom of search results, where 'Output Records' section is
	5. Select records -> 1-500 since 500 is max
	6. Select content -> Full Record & Cited References
	7. Select destination -> "save to tab-delimited file (UTF-8)"
	8. Inspect text file with text editor

	Load the data into R...
	```{r}
	# Read every Web of Science export in the citations folder into one table.
	# `wd` is the folder that contains the tab-delimited .txt files of citations;
	# the final (eval=FALSE) chunk of this document reads `wd` again.
	wd <- "C:/Users/marwick/Downloads/AAcitations"
	setwd(wd)
	# library() stops with an error when data.table is not installed, whereas
	# require() only returns FALSE and lets a later call fail less clearly
	library(data.table)
	# only pick up the .txt exports, so that any other file sitting in the
	# folder is not parsed as if it held citation records
	fl <- list.files(pattern = "\\.txt$", ignore.case = TRUE)
	if (length(fl) == 0) {
	  stop("No .txt citation files found in ", wd, call. = FALSE)
	}
	# quote = "" switches off quote handling, check.names = FALSE keeps the
	# header tags exactly as written in the file, and row.names = NULL forces
	# plain row numbering instead of using a data column as row names
	arts_list <- lapply(fl, function(f) {
	  data.table(read.delim(f, sep = "\t", stringsAsFactors = FALSE,
	                        check.names = FALSE, quote = "", row.names = NULL))
	})
	# stack the per-file data tables into one big data table (columns are
	# matched by name, as rbind() on data tables does)
	arts <- rbindlist(arts_list, use.names = TRUE)
	```
	Rearrange the table to split out the citations into another table but keep a lookup value so the new table of citations can be linked to the article in the original table.

	First, format citations to try and make them consistent.
	```{r}
	library(data.table)
	# Build one label per source article, laid out like a WOS cited reference:
	#   "FIRST AUTHOR YEAR, JOURNAL, Vnnn, Pnnn, DOI xxx"
	# so that source articles and their citations can be compared as text.
	# NOTE(review): the column tags used below (PD for the year, BN for the
	# journal, PY for the volume, MA for the page, AR for the DOI) are taken
	# as-is from the table; presumably the header is shifted by one column when
	# the files are read -- confirm against the data.

	# TRUE where a field is NA or holds nothing but whitespace
	is_blank <- function(x) {
	  is.na(x) | !nzchar(trimws(as.character(x)))
	}

	# first author: text before the first ";", periods removed, upper case
	# (an empty author field gives "" instead of a subscript error)
	au_parts <- strsplit(as.character(arts[, `ï»¿PT`]), ";", fixed = TRUE)
	au1 <- toupper(gsub(".", "", vapply(au_parts, function(p) {
	  if (length(p) > 0) p[[1]] else ""
	}, character(1)), fixed = TRUE))
	# year (with spaces and comma)
	yr <- paste0(" ", arts[, PD], ",")
	# abbreviated journal title (with spaces and comma)
	bn <- paste0(" ", arts[, BN], ",")
	# VNNN -- left out when the field is missing or empty
	vn <- ifelse(is_blank(arts[, PY]), "", paste0(" V", arts[, PY], ","))
	# PNNN -- left out when the field is missing or empty
	pn <- ifelse(is_blank(arts[, MA]), "", paste0(" P", arts[, MA]))
	# DOI -- left out when the field is missing or empty, so that an empty
	# character field no longer leaves a dangling ", DOI " at the end
	do <- ifelse(is_blank(arts[, AR]), "", paste0(", DOI ", arts[, AR]))
	# put it all together; `ar` stays a list with one element per article
	# because the next chunk indexes it as ar[i]
	ar <- if (nrow(arts) == 0) {
	  list()
	} else {
	  as.list(paste0(au1, yr, bn, vn, pn, do))
	}
	# the same labels as a plain character vector
	ar1 <- unlist(ar)
	```
	Second, make the table of source articles and citations, trying to make the text format consistent on both sides.
	```{r}
	library(data.table)
	# Build a two-column table with one row per (source article, cited reference)
	# pair: `ar` is the source article label, `cits` the reformatted citation.
	cit_list <- vector("list", length = nrow(arts))
	for (i in seq_len(nrow(arts))) {
	  # cited references of article i, held as one "; "-separated string
	  raw_refs <- as.character(arts[i, FX])
	  # control flow in case there are no citations(!): a missing or empty
	  # field contributes no rows at all
	  if (is.na(raw_refs) || !nzchar(raw_refs)) {
	    next
	  }
	  refs <- unlist(strsplit(raw_refs, "; ", fixed = TRUE))
	  refs <- refs[nzchar(refs)]
	  # get rid of the comma between author and year (first comma only)
	  refs <- sub(",", "", refs, fixed = TRUE)
	  # get rid of periods within the author name, which is the piece before
	  # the first remaining comma, then glue the pieces back together.
	  # collapse = "," yields exactly one string per citation; pasting the
	  # trailing pieces with sep = "," would give one string per trailing piece
	  # and turn a single citation into several rows.
	  pieces <- strsplit(refs, ",", fixed = TRUE)
	  c2 <- toupper(vapply(pieces, function(p) {
	    p[1] <- gsub(".", "", p[1], fixed = TRUE)
	    paste(p, collapse = ",")
	  }, character(1)))

	  if (length(c2) == 0) {
	    next
	  }
	  # repeat the source label once per citation so both columns line up
	  cit_list[[i]] <- data.table(
	    ar = toupper(unlist(rep(ar[i], length(c2)))),
	    cits = c2
	  )
	}
	# put list of dts into one dt (articles without citations are left as NULL
	# entries in the list)
	cits <- rbindlist(cit_list)
	```
	Now that we have the data in a table, we can create and plot the network
	```{r}
	# Build the citation graph from the edge table and plot a pruned version.
	# trim both labels to their first 40 characters (lower case) so that they
	# are readable on the plot
	cits1 <- data.table(ar = tolower(substr(cits$ar, 1, 40)),
	                    cits = tolower(substr(cits$cits, 1, 40)))
	# library() fails straight away when igraph is missing
	library(igraph)
	# one edge per row of cits1, from the `ar` column to the `cits` column
	g <- graph.edgelist(as.matrix(cits1))
	# identify those vertices part of less than n edges
	n <- 50
	g.vs <- V(g)[degree(g) < n]
	# exclude them from the graph
	g1 <- delete.vertices(g, g.vs)
	# degrees change once vertices are gone, so prune again with a lower
	# threshold (n/5) on the reduced graph
	g1 <- delete.vertices(g1, which(degree(g1) < n/5))
	# make vertex sizes variable (proportional to degree)
	V(g1)$size <- degree(g1)/10
	# make label sizes variable (proportional to degree)
	V(g1)$label.cex <- degree(g1)/10
	# basic plot
	# windowsFonts() is only available on Windows, so the font mapping is
	# registered there and skipped on other platforms instead of stopping
	if (.Platform$OS.type == "windows") {
	  windowsFonts(Arial = windowsFont("Arial"))
	}
	plot(g1,
	     edge.curved = TRUE,
	     vertex.color = "grey",
	     edge.arrow.size = 0.01,
	     vertex.label.dist = 0.5,
	     vertex.label.color = "grey30",
	     vertex.label.family = "Arial"
	     ) # add vertex.label = NA to hide the labels
	# use a layout: same graph, Fruchterman-Reingold layout, labels hidden
	layout1 <- layout.fruchterman.reingold(g1, niter = 500)
	plot(g1, layout = layout1, edge.curved = TRUE, vertex.color = "grey",
	     edge.arrow.size = 0.1, vertex.label.dist = 0.5, vertex.label = NA)


	```


	```{r}
	# d3 plot in web browser, using d3SimpleNetwork() from the d3Network package
	# install from GitHub only when the package is not there yet, and use the
	# "username/repo" form of install_github()
	if (!requireNamespace("d3Network", quietly = TRUE)) {
	  devtools::install_github("christophergandrud/d3Network")
	}
	library(d3Network)
	# get.data.frame() turns the pruned graph back into an edge table; the
	# interactive plot is written to d3net.html
	d3Network::d3SimpleNetwork(get.data.frame(g1), width = 1500, height = 800,
	               textColour = "green", linkColour = "red", fontsize = 10,
	               charge = -1000, opacity = 0.9, file = "d3net.html")


	```


	We can search for an author.
	```{r}
	# Look up one author in the citation table and in the full graph.
	author <- "DE LEON"
	# rows of the citation table whose source article label contains the author
	sear <- cits[grepl(author, cits$ar), ]
	# subset graph by label: http://lists.gnu.org/archive/html/igraph-help/2009-10/msg00039.html
	# vertex names in g are lower case (see where g is built), so the search
	# term is lower-cased before matching; this matches source articles and
	# cited references alike
	hits <- which(grepl(tolower(author), V(g)$name, fixed = TRUE))
	degree(g, hits)
	# subset: keep only the edges that touch one of the matching vertices
	# NOTE(review): assumes the intended subset is the neighbourhood of the
	# searched author -- confirm
	hit_edges <- unique(unlist(lapply(hits, function(v) {
	  as.integer(incident(g, v, mode = "all"))
	})))
	subgraph.edges(g, as.integer(hit_edges))
	```

	```{r}
	# Save the pruned network as GraphML, a format that Gephi can open
	graphml_file <- "g.graphml"
	write.graph(g1, file = graphml_file, format = "graphml")
	# The file name above is relative, so show the working directory to find it
	getwd()
	```


	Background and ideas:

	http://kieranhealy.org/philcites/
	http://kieranhealy.org/blog/archives/2013/06/18/a-co-citation-network-for-philosophy/
	http://nealcaren.web.unc.edu/a-sociology-citation-network/

	Choosing community detection algos:

	http://igraph.wikidot.com/community-detection-in-r
	http://bommaritollc.com/2012/06/17/summary-community-detection-algorithms-igraph-0-6/?utm_source=rss&utm_medium=rss&utm_campaign=summary-community-detection-algorithms-igraph-0-6
	http://stackoverflow.com/a/9478989/1036500


	```{r, echo=FALSE, message=FALSE, eval=FALSE}
	# This chunk is not evaluated when the document is knitted (eval=FALSE);
	# run it by hand to knit the document and convert the result to PDF.
	# Load packages
	setwd(wd) # assumes wd has been set earlier in the doc
	require(knitr)
	require(markdown)

	# name of this markdown file without suffix
	# NOTE(review): confirm that this matches the actual file name of this document
	filen <- "citation_net"
	rmd_file <- paste0(filen, ".rmd")
	md_file <- paste0(filen, ".md")
	pdf_file <- paste0(filen, ".pdf")
	# process the .rmd file with knit(), then hand the .md file to pandoc
	knit(rmd_file)
	# pandoc makes the PDF via LaTeX (including smart punctuation and grey
	# background of code blocks)
	pandoc_cmd <- paste0("pandoc -s ", md_file, " -t latex -o ", pdf_file, " --highlight-style=tango -S")
	system(pandoc_cmd)
	```