Last active
December 19, 2015 04:28
-
-
Save benmarwick/5897321 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Citation Network Analysis | |
======================================================== | |
1. Go to http://apps.webofknowledge.com.offcampus.lib.washington.edu/ | |
2. click the 'web of science' tab at the top | |
3. Do a search... refine search by journal... perhaps archaeolog* in 'publication title', hit 'search' | |
4. scroll to bottom of search results, where 'Output Records' section is | |
5. Select records -> 1-500 since 500 is max | |
6. Select content -> Full Record & Cited References | |
7. Select destination -> "save to tab-delimited file (UTF-8)" | |
8. Inspect text file with text editor | |
Load the data into R... | |
```{r} | |
# Get the files of citations into R as one data.table, `arts`.
library(data.table)
# this is the folder that contains the txt files of citations
wd <- "C:/Users/marwick/Downloads/AAcitations"
# later chunks read and write files relative to the working directory, so it
# is still switched to the data folder here
setwd(wd)
# only pick up the .txt exports: other chunks in this document save
# d3net.html and g.graphml into the working directory, and those must not be
# read back in as citation data on a second run
fl <- list.files(pattern = "\\.txt$", ignore.case = TRUE)
if (length(fl) == 0) {
  stop("No .txt citation files found in ", wd, call. = FALSE)
}
# one data.table per file; the read.delim() settings are left exactly as they
# were because the column names used later depend on how the header is parsed
arts_list <- lapply(fl, function(i) {
  data.table(read.delim(i, sep = "\t", stringsAsFactors = FALSE,
                        check.names = FALSE, quote = "", row.names = NULL))
})
# rearrange data tables in list into one big data table (matched by name)
arts <- rbindlist(arts_list, use.names = TRUE)
``` | |
Rearrange the table to split out the citations into another table but keep a lookup value so the new table of citations can be linked to the article in the original table. | |
First, format citations to try and make them consistent. | |
```{r} | |
library(data.table)
# Get the source article details in the same format as WOS citations:
# "AUTHOR YEAR, JOURNAL, Vnnn, Pnnn, DOI xxx". The result is `ar` (a list with
# one string per row of `arts`) and `ar1` (the same strings as a vector).
#
# NOTE(review): the column tags used here (PT, PD, BN, PY, MA, AR) are read as
# first author, year, abbreviated journal, volume, page and DOI. That does not
# match the usual meaning of those WOS tags -- presumably the header was
# shifted by one column on import (row.names = NULL). Confirm against the data
# before renaming anything.
# NOTE(review): if author names are stored as "Surname, Initials" the comma
# survives in the source string, while the citation side removes its first
# comma -- verify that the two sides really line up.
stopifnot(all(c("PT", "PD", "BN", "PY", "MA", "AR") %in% names(arts)))

# TRUE where a field is missing, whether it was imported as NA or as ""
is_blank <- function(x) {
  is.na(x) | !nzchar(trimws(as.character(x)))
}

# first author (remove periods); an empty author field gives "" rather than
# stopping with a subscript error
first_author <- vapply(
  strsplit(as.character(arts[["PT"]]), ";", fixed = TRUE),
  function(authors) if (length(authors) == 0) "" else authors[1],
  character(1)
)
au1 <- toupper(gsub(".", "", first_author, fixed = TRUE))
# year (with spaces and comma)
yr <- paste0(" ", arts[["PD"]], ",")
# abbreviated journal title (with spaces and comma)
bn <- paste0(" ", arts[["BN"]], ",")
# VNNN (left out when there is no volume)
vn <- ifelse(is_blank(arts[["PY"]]), "", paste0(" V", arts[["PY"]], ","))
# PNNN (left out when there is no page)
pn <- ifelse(is_blank(arts[["MA"]]), "", paste0(" P", arts[["MA"]]))
# DOI (left out when there is no DOI, so no dangling ", DOI " is appended)
doi <- ifelse(is_blank(arts[["AR"]]), "", paste0(", DOI ", arts[["AR"]]))
# put it all together; kept as a list because the next chunk indexes it as one
ar <- as.list(paste0(au1, yr, bn, vn, pn, doi))
# and the same strings as a plain character vector
ar1 <- unlist(ar)
``` | |
Second, make the table of source article and citations, trying to make the text format consistent on both sides.
```{r} | |
library(data.table)
# Make `cits`: one row per (source article, cited reference) pair.
# `cit_list` holds one data.table per article, or NULL when the article has no
# cited references.
cit_list <- lapply(seq_len(nrow(arts)), function(i) {
  # the cited references of article i, one string per reference
  refs <- unlist(strsplit(as.character(arts[["FX"]][i]), "; ", fixed = TRUE))
  # reformat citations to match source and improve consistency:
  # get rid of the comma between author and year (the first comma only)
  refs <- sub(",", "", refs, fixed = TRUE)
  # drop missing or empty entries so they do not turn into "NA" citations
  refs <- refs[!is.na(refs) & nzchar(refs)]
  # control flow in case there are no citations(!)
  if (length(refs) == 0) {
    return(NULL)
  }
  # get rid of periods in the author name (the first comma-separated field),
  # then glue ALL fields back into a single string. Pasting the trailing
  # fields as a vector would recycle and give several partial strings for one
  # reference.
  cleaned <- vapply(strsplit(refs, ",", fixed = TRUE), function(fields) {
    fields[1] <- gsub(".", "", fields[1], fixed = TRUE)
    paste(fields, collapse = ",")
  }, character(1))
  # repeat the source article string once per citation
  data.table(ar = toupper(rep(unlist(ar[i]), length(cleaned))),
             cits = toupper(cleaned))
})
# put list of dts into one dt (NULL entries are skipped)
cits <- rbindlist(cit_list)
``` | |
Now that we have the data in a table, we can create and plot the network | |
```{r} | |
# Build the citation graph `g` from the edge table, prune it to the
# well-connected core `g1`, and draw it twice (default layout, then
# Fruchterman-Reingold).
library(igraph)
# trim cits so labels are readable: lower case, first 40 characters
cits1 <- data.table(ar = tolower(substr(cits$ar, 1, 40)),
                    cits = tolower(substr(cits$cits, 1, 40)))
# each row of cits1 becomes one edge: source article -> cited reference
g <- graph.edgelist(as.matrix(cits1))
# identify those vertices part of less than n edges
n <- 50
g.vs <- V(g)[degree(g) < n]
# exclude them from the graph
g1 <- delete.vertices(g, g.vs)
# remove nodes that don't have a set number of edges
# (degrees are recomputed on the already-pruned graph)
g1 <- delete.vertices(g1, which(degree(g1) < n / 5))
# make vertex sizes and label sizes variable, both scaled by degree
deg1 <- degree(g1)
V(g1)$size <- deg1 / 10
V(g1)$label.cex <- deg1 / 10
# basic plot
# windowsFonts() only exists in R on Windows; skip the font registration on
# other platforms instead of stopping the chunk with an error
if (.Platform$OS.type == "windows") {
  windowsFonts(Arial = windowsFont("Arial"))
}
plot(g1,
     edge.curved = TRUE,
     vertex.color = "grey",
     edge.arrow.size = 0.01,
     vertex.label.dist = 0.5,
     vertex.label.color = "grey30",
     vertex.label.family = "Arial"
) # vertex.label = NA)
# use a layout
layout1 <- layout.fruchterman.reingold(g1, niter = 500)
plot(g1, layout = layout1, edge.curved = TRUE, vertex.color = "grey",
     edge.arrow.size = 0.1, vertex.label.dist = 0.5, vertex.label = NA)
``` | |
```{r} | |
# d3 plot in web browser
# d3SimpleNetwork() is from the d3Network package, which is installed from
# GitHub only when it is not already available
if (!requireNamespace("d3Network", quietly = TRUE)) {
  # the repository is given as one "account/repo" string; a second positional
  # argument is not read as the account name by current devtools
  devtools::install_github("christophergandrud/d3Network")
}
library(d3Network)
# write the edge list of g1 as an interactive network to d3net.html in the
# working directory
d3Network::d3SimpleNetwork(get.data.frame(g1), width = 1500, height = 800,
                           textColour = "green", linkColour = "red",
                           fontsize = 10, charge = -1000, opacity = 0.9,
                           file = "d3net.html")
``` | |
We can search for an author. | |
```{r} | |
# Search for one author in the citation table and in the full graph `g`.
author <- "DE LEON"
# rows of the citation table whose source article mentions the author
sear <- cits[grepl(author, cits$ar, fixed = TRUE), ]
# subset graph by label: http://lists.gnu.org/archive/html/igraph-help/2009-10/msg00039.html
# vertex names in g are the lower-cased, truncated labels held in cits1, so
# the search string is lower-cased before matching
author_lc <- tolower(author)
author_vs <- which(grepl(author_lc, V(g)$name, fixed = TRUE))
degree(g, author_vs)
# subset
# keep the edges whose citing or cited label mentions the author; g was built
# from the rows of cits1 in order, so row numbers are used as edge ids
author_es <- which(grepl(author_lc, cits1$ar, fixed = TRUE) |
                     grepl(author_lc, cits1$cits, fixed = TRUE))
g_author <- subgraph.edges(g, author_es)
g_author
``` | |
```{r} | |
# save the pruned graph as GraphML, a format Gephi can open
graphml_file <- "g.graphml"
write.graph(g1, file = graphml_file, format = "graphml")
# the file is written relative to the working directory; print that folder
getwd()
``` | |
Background and ideas: | |
http://kieranhealy.org/philcites/ | |
http://kieranhealy.org/blog/archives/2013/06/18/a-co-citation-network-for-philosophy/ | |
http://nealcaren.web.unc.edu/a-sociology-citation-network/ | |
Choosing community detection algos: | |
http://igraph.wikidot.com/community-detection-in-r | |
http://bommaritollc.com/2012/06/17/summary-community-detection-algorithms-igraph-0-6/?utm_source=rss&utm_medium=rss&utm_campaign=summary-community-detection-algorithms-igraph-0-6 | |
http://stackoverflow.com/a/9478989/1036500 | |
```{r, echo=FALSE, message=FALSE, eval=FALSE} | |
# This chunk is to run the code and generate the PDF
# Load packages
setwd(wd) # assumes wd has been set earlier in the doc
library(knitr)
library(markdown)
# process .md and .pdf files (including smart punctuation and grey background of code blocks)
filen <- "citation_net" # name of this markdown file without suffix
knit(paste0(filen, ".rmd"))
# Smart punctuation: pandoc 2.0 removed the -S flag in favour of the "smart"
# extension, so pick whichever form the installed pandoc understands. If the
# version cannot be determined the old flag is kept.
pandoc_banner <- tryCatch(
  system2("pandoc", "--version", stdout = TRUE)[1],
  error = function(e) NA_character_
)
pandoc_version <- numeric_version(
  sub("^pandoc(\\.exe)? +", "", pandoc_banner),
  strict = FALSE
)
smart_args <- if (isTRUE(pandoc_version >= "2.0")) {
  c("-f", "markdown+smart")
} else {
  "-S"
}
# file names are quoted so the command does not depend on the name being free
# of spaces
status <- system2("pandoc", c(
  "-s", shQuote(paste0(filen, ".md")),
  "-t", "latex",
  "-o", shQuote(paste0(filen, ".pdf")),
  "--highlight-style=tango",
  smart_args
))
if (!identical(status, 0L)) {
  warning("pandoc exited with status ", status, call. = FALSE)
}
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment