njahn82 · October 16, 2015 13:56
diff --git a/report_vu_doaj.md b/report_vu_doaj.md
diff --git a/report_vu_doaj.Rmd b/report_vu_doaj.Rmd

 ## Load Data

 ```{r}
 # library() stops with an error when dplyr is missing; require() would only
 # warn and let the script fail later with a less helpful message
 library(dplyr)
 # load ISI spreadsheet and select only columns needed
 # (empty cells are read as NA because of na.strings = "")
 vu_amst <- read.csv("pubs_2012-14 V5 CSV.csv", header = TRUE, sep = ";",
                     na.strings = "") %>%
   select(JI, PY, SN, PU, DI, UT)
 tbl_df(vu_amst)

 # load DOAJ spreadsheet and select only columns needed;
 # abort on an HTTP error instead of trying to parse an error page
 doaj_response <- httr::GET("http://doaj.org/csv")
 httr::stop_for_status(doaj_response)
 doaj <- httr::content(doaj_response) %>%
   select(Journal.ISSN..print.version.,
          Journal.EISSN..online.version.,
          Journal.article.processing.charges..APCs.,
          First.calendar.year.journal.provided.online.Open.Access.content)
 tbl_df(doaj)
 ```

 ## Prepare Match

 ```{r}
 # join ISSN and EISSN into one vector
 doaj.issn <- c(as.character(doaj$Journal.ISSN..print.version.),
                as.character(doaj$Journal.EISSN..online.version.))
 # drop empty AND missing entries: an NA left in this vector would make
 # `NA %in% doaj.issn` TRUE, flagging articles without an ISSN as DOAJ hits
 doaj.issn <- doaj.issn[!is.na(doaj.issn) & doaj.issn != ""]

 # convert to class character so the comparison is string against string
 vu_amst$SN <- as.character(vu_amst$SN)
 ```

 ## Match

 ```{r}
 # flag every article whose ISSN (SN) occurs among the DOAJ ISSNs,
 # then show how many articles fall on each side
 vu_amst[["DOAJ"]] <- is.element(vu_amst[["SN"]], doaj.issn)
 table(vu_amst[["DOAJ"]])
 ```

 ## Merge

 ```{r}
 # inner-join the articles to DOAJ twice: once via the print ISSN,
 # once via the electronic ISSN
 tt <- merge(x = vu_amst, y = doaj,
             by.x = "SN", by.y = "Journal.ISSN..print.version.")
 tt_2 <- merge(x = vu_amst, y = doaj,
               by.x = "SN", by.y = "Journal.EISSN..online.version.")
 # give both results identical headers so they can be stacked
 # NOTE(review): this relabels the leftover print-ISSN column of tt_2 as the
 # EISSN column -- confirm nothing downstream relies on that column
 names(tt_2) <- names(tt)
 vu_doaj <- do.call(rbind, list(tt, tt_2))
 ```

 It is important to check for journals that were converted to OA and to exclude articles that were published before the journal started providing OA content. This is especially important for the SCOAP3 journals.

 ```{r}
 # keep only articles published in or after the journal's first OA year
 vu_doaj <- vu_doaj %>%
   filter(PY >= First.calendar.year.journal.provided.online.Open.Access.content)
 ```

 ## Explore Data

 Let's drop levels not required before exploring the data

 ```{r}
 # discard factor levels that no longer occur after merging and filtering
 vu_doaj <- vu_doaj %>% droplevels()
 ```

 ### By Year published

 ```{r}
 # create matrix
 # Tabulate both data sets over the same set of years. Two independent
 # table() calls are only aligned by position, so a year without any DOAJ
 # article would misalign (or break) the rbind and the share calculation.
 pub_years <- sort(unique(vu_amst$PY))
 n_all <- table(factor(vu_amst$PY, levels = pub_years))
 n_doaj <- table(factor(vu_doaj$PY, levels = pub_years))
 by_year <- rbind(all = n_all, doaj = n_doaj, share = n_doaj / n_all * 100)
 # print
 knitr::kable(by_year, digits = 2)
 ```

 ### By publisher

 Publisher names are a bit messy in the Web of Science. Before we tabulate the OA-publishers by year, let's clean up some publisher names:

 ```{r}
 # Work on character values: if PU is a factor, assigning a name that is not
 # already one of its levels would produce NA (with a warning) instead of
 # the cleaned-up name.
 vu_doaj$PU <- as.character(vu_doaj$PU)
 # collapse spelling variants of the same publisher into one label
 vu_doaj$PU[grepl("Wiley", vu_doaj$PU, ignore.case = TRUE)] <-
   "WILEY-BLACKWELL"
 vu_doaj$PU[grepl("FRONTIERS", vu_doaj$PU, ignore.case = TRUE)] <-
   "FRONTIERS RESEARCH FOUNDATION"
 vu_doaj$PU[grepl("ELSEVIER", vu_doaj$PU, ignore.case = TRUE)] <-
   "ELSEVIER SCIENCE BV"
 ```

 We've identified `r length(unique(vu_doaj$PU))` OA publishers. To calculate OA Gold publications over publishers:

 ```{r}
 # number of articles per publisher, largest first
 vu_doaj %>%
   count(PU) %>%
   arrange(desc(n))
 ```

 Plot OA Gold publications over publishers and year published

 ```{r}
 # library() fails loudly if a package is missing; require() only warns
 library(dplyr)
 library(ggplot2)

 # take only the five most popular publishers
 # Order publishers by article count, most frequent first. names() is used
 # because rownames(data.frame(<table>)) yields "1", "2", ... when the table
 # class is retained, which would turn every publisher into NA.
 pu_ranked <- names(rev(sort(table(vu_doaj$PU))))
 vu_doaj$PU <- factor(vu_doaj$PU, levels = pu_ranked)

 # lump everything beyond the top five into one "other" level; skip this when
 # there are five or fewer publishers (6:length(...) would count backwards)
 n_top <- 5
 if (nlevels(vu_doaj$PU) > n_top) {
   other_label <- paste0("other (n=", length(unique(vu_doaj$PU)) - n_top, ")")
   levels(vu_doaj$PU)[(n_top + 1):nlevels(vu_doaj$PU)] <- other_label
 }

 # articles per publisher and publication year
 publisher_by_yr <- group_by(vu_doaj, PU, PY) %>% tally()
 publisher_by_yr

 # stacked area chart, one band per publisher
 ggplot(publisher_by_yr, aes(factor(PY), n, fill = PU, group = PU)) +
   geom_area(position = 'stack') +
   scale_fill_manual("Publisher",
                     values = c("#f39c12", "#2980b9", "#2ecc71", "#fb8072",
                                "#ffffb3", "#bdc3c7")) +
   xlab("Year") +
   ylab("ISI OA Gold articles") +
   theme_bw()
 ```

 ### By journal

 ```{r}
 # relevel by journal
 # Order journals by article count, most frequent first. names() is used
 # because rownames(data.frame(<table>)) yields "1", "2", ... when the table
 # class is retained, which would turn every journal into NA.
 ji_ranked <- names(rev(sort(table(vu_doaj$JI))))
 vu_doaj$JI <- factor(vu_doaj$JI, levels = ji_ranked)

 # articles per journal and publisher
 group_by(vu_doaj, JI, PU) %>% tally()
 ```


diff --git a/unnamed-chunk-10-1.png b/unnamed-chunk-10-1.png
	2012	2013	2014
all	5181.00	5526.00	5730.00
doaj	493.00	635.00	751.00
share	9.52	11.49	13.11

	## Load Data

	```{r}
	# library() stops with an error when dplyr is missing; require() would only
	# warn and let the script fail later with a less helpful message
	library(dplyr)
	# load ISI spreadsheet and select only columns needed
	# (empty cells are read as NA because of na.strings = "")
	vu_amst <- read.csv("pubs_2012-14 V5 CSV.csv", header = TRUE, sep = ";",
	  na.strings = "") %>%
	  select(JI, PY, SN, PU, DI, UT)
	tbl_df(vu_amst)

	# load DOAJ spreadsheet and select only columns needed;
	# abort on an HTTP error instead of trying to parse an error page
	doaj_response <- httr::GET("http://doaj.org/csv")
	httr::stop_for_status(doaj_response)
	doaj <- httr::content(doaj_response) %>%
	  select(Journal.ISSN..print.version.,
	    Journal.EISSN..online.version.,
	    Journal.article.processing.charges..APCs.,
	    First.calendar.year.journal.provided.online.Open.Access.content)
	tbl_df(doaj)
	```

	## Prepare Match

	```{r}
	# join ISSN and EISSN into one vector
	doaj.issn <- c(as.character(doaj$Journal.ISSN..print.version.),
	  as.character(doaj$Journal.EISSN..online.version.))
	# drop empty AND missing entries: an NA left in this vector would make
	# `NA %in% doaj.issn` TRUE, flagging articles without an ISSN as DOAJ hits
	doaj.issn <- doaj.issn[!is.na(doaj.issn) & doaj.issn != ""]

	# convert to class character so the comparison is string against string
	vu_amst$SN <- as.character(vu_amst$SN)
	```

	## Match

	```{r}
	# flag every article whose ISSN (SN) occurs among the DOAJ ISSNs,
	# then show how many articles fall on each side
	vu_amst[["DOAJ"]] <- is.element(vu_amst[["SN"]], doaj.issn)
	table(vu_amst[["DOAJ"]])
	```

	## Merge

	```{r}
	# inner-join the articles to DOAJ twice: once via the print ISSN,
	# once via the electronic ISSN
	tt <- merge(x = vu_amst, y = doaj,
	  by.x = "SN", by.y = "Journal.ISSN..print.version.")
	tt_2 <- merge(x = vu_amst, y = doaj,
	  by.x = "SN", by.y = "Journal.EISSN..online.version.")
	# give both results identical headers so they can be stacked
	# NOTE(review): this relabels the leftover print-ISSN column of tt_2 as the
	# EISSN column -- confirm nothing downstream relies on that column
	names(tt_2) <- names(tt)
	vu_doaj <- do.call(rbind, list(tt, tt_2))
	```

	It is important to check for journals that were converted to OA and to exclude articles that were published before the journal started providing OA content. This is especially important for the SCOAP3 journals.

	```{r}
	# keep only articles published in or after the journal's first OA year
	vu_doaj <- vu_doaj %>%
	  filter(PY >= First.calendar.year.journal.provided.online.Open.Access.content)
	```

	## Explore Data

	Let's drop levels not required before exploring the data

	```{r}
	# discard factor levels that no longer occur after merging and filtering
	vu_doaj <- vu_doaj %>% droplevels()
	```

	### By Year published

	```{r}
	# create matrix
	# Tabulate both data sets over the same set of years. Two independent
	# table() calls are only aligned by position, so a year without any DOAJ
	# article would misalign (or break) the rbind and the share calculation.
	pub_years <- sort(unique(vu_amst$PY))
	n_all <- table(factor(vu_amst$PY, levels = pub_years))
	n_doaj <- table(factor(vu_doaj$PY, levels = pub_years))
	by_year <- rbind(all = n_all, doaj = n_doaj, share = n_doaj / n_all * 100)
	# print
	knitr::kable(by_year, digits = 2)
	```

	### By publisher

	Publisher names are a bit messy in the Web of Science. Before we tabulate the OA-publishers by year, let's clean up some publisher names:

	```{r}
	# Work on character values: if PU is a factor, assigning a name that is not
	# already one of its levels would produce NA (with a warning) instead of
	# the cleaned-up name.
	vu_doaj$PU <- as.character(vu_doaj$PU)
	# collapse spelling variants of the same publisher into one label
	vu_doaj$PU[grepl("Wiley", vu_doaj$PU, ignore.case = TRUE)] <-
	  "WILEY-BLACKWELL"
	vu_doaj$PU[grepl("FRONTIERS", vu_doaj$PU, ignore.case = TRUE)] <-
	  "FRONTIERS RESEARCH FOUNDATION"
	vu_doaj$PU[grepl("ELSEVIER", vu_doaj$PU, ignore.case = TRUE)] <-
	  "ELSEVIER SCIENCE BV"
	```

	We've identified `r length(unique(vu_doaj$PU))` OA publishers. To calculate OA Gold publications over publishers:

	```{r}
	# number of articles per publisher, largest first
	vu_doaj %>%
	  count(PU) %>%
	  arrange(desc(n))
	```

	Plot OA Gold publications over publishers and year published

	```{r}
	# library() fails loudly if a package is missing; require() only warns
	library(dplyr)
	library(ggplot2)

	# take only the five most popular publishers
	# Order publishers by article count, most frequent first. names() is used
	# because rownames(data.frame(<table>)) yields "1", "2", ... when the table
	# class is retained, which would turn every publisher into NA.
	pu_ranked <- names(rev(sort(table(vu_doaj$PU))))
	vu_doaj$PU <- factor(vu_doaj$PU, levels = pu_ranked)

	# lump everything beyond the top five into one "other" level; skip this when
	# there are five or fewer publishers (6:length(...) would count backwards)
	n_top <- 5
	if (nlevels(vu_doaj$PU) > n_top) {
	  other_label <- paste0("other (n=", length(unique(vu_doaj$PU)) - n_top, ")")
	  levels(vu_doaj$PU)[(n_top + 1):nlevels(vu_doaj$PU)] <- other_label
	}

	# articles per publisher and publication year
	publisher_by_yr <- group_by(vu_doaj, PU, PY) %>% tally()
	publisher_by_yr

	# stacked area chart, one band per publisher
	ggplot(publisher_by_yr, aes(factor(PY), n, fill = PU, group = PU)) +
	  geom_area(position = 'stack') +
	  scale_fill_manual("Publisher",
	    values = c("#f39c12", "#2980b9", "#2ecc71", "#fb8072",
	      "#ffffb3", "#bdc3c7")) +
	  xlab("Year") +
	  ylab("ISI OA Gold articles") +
	  theme_bw()
	```

	### By journal

	```{r}
	# relevel by journal
	# Order journals by article count, most frequent first. names() is used
	# because rownames(data.frame(<table>)) yields "1", "2", ... when the table
	# class is retained, which would turn every journal into NA.
	ji_ranked <- names(rev(sort(table(vu_doaj$JI))))
	vu_doaj$JI <- factor(vu_doaj$JI, levels = ji_ranked)

	# articles per journal and publisher
	group_by(vu_doaj, JI, PU) %>% tally()
	```