Skip to content

Instantly share code, notes, and snippets.

@njahn82
Last active October 16, 2015 13:56
Show Gist options
  • Select an option

  • Save njahn82/b39934e28b6a4fb59f9a to your computer and use it in GitHub Desktop.

Select an option

Save njahn82/b39934e28b6a4fb59f9a to your computer and use it in GitHub Desktop.
ISI - DOAJ match for VU Amsterdam
title author date output keep_md
Gold OA output VU Amsterdam - How to
Najko Jahn
16. Oktober 2015
html_document
true

Load Data

# Attach dplyr with library(): unlike require(), it stops with an error when
# the package is missing instead of returning FALSE and failing later on.
library(dplyr)
# Load the ISI spreadsheet (semicolon-separated, empty cells read as NA) and
# select only the columns needed.
vu_amst <- read.csv("pubs_2012-14 V5 CSV.csv", header = TRUE, sep = ";",
                    na.strings = "") %>%
  select(JI, PY, SN, PU, DI, UT)
tbl_df(vu_amst)
## Source: local data frame [16,438 x 6]
## 
##                  JI    PY        SN                     PU
##              (fctr) (int)    (fctr)                 (fctr)
## 1     Retrovirology  2014 1742-4690     BIOMED CENTRAL LTD
## 2  BMC Infect. Dis.  2014 1471-2334     BIOMED CENTRAL LTD
## 3       BMC Neurol.  2014 1471-2377     BIOMED CENTRAL LTD
## 4          PLoS One  2014 1932-6203 PUBLIC LIBRARY SCIENCE
## 5          PLoS One  2014 1932-6203 PUBLIC LIBRARY SCIENCE
## 6          PLoS One  2014 1932-6203 PUBLIC LIBRARY SCIENCE
## 7          PLoS One  2014 1932-6203 PUBLIC LIBRARY SCIENCE
## 8          PLoS One  2014 1932-6203 PUBLIC LIBRARY SCIENCE
## 9   Psychiatry Res.  2014 0165-1781   ELSEVIER IRELAND LTD
## 10 Hydrol. Process.  2014 0885-6087        WILEY-BLACKWELL
## ..              ...   ...       ...                    ...
## Variables not shown: DI (fctr), UT (fctr)
# Load the DOAJ spreadsheet and select only the columns needed.
# Check the HTTP status first so a failed download stops here instead of an
# error page being parsed as if it were the CSV.
doaj_response <- httr::GET("http://doaj.org/csv")
httr::stop_for_status(doaj_response)
doaj <- httr::content(doaj_response) %>%
  select(Journal.ISSN..print.version.,
         Journal.EISSN..online.version.,
         Journal.article.processing.charges..APCs.,
         First.calendar.year.journal.provided.online.Open.Access.content)
tbl_df(doaj)
## Source: local data frame [10,609 x 4]
## 
##    Journal.ISSN..print.version. Journal.EISSN..online.version.
##                           (chr)                          (chr)
## 1                     0001-3765                               
## 2                     0001-494X                      2282-0035
## 3                     0001-5113                      1846-0453
## 4                     0001-527X                      1734-154X
## 5                     0001-5555                      1651-2057
## 6                     0001-6012                               
## 7                     0001-625X                      2353-074X
## 8                     0001-6977                      2083-9480
## 9                     0001-7019                      1846-0410
## 10                    0001-7213                      1801-7576
## ..                          ...                            ...
## Variables not shown: Journal.article.processing.charges..APCs. (chr),
##   First.calendar.year.journal.provided.online.Open.Access.content (int)

Prepare Match

# Join print ISSN and EISSN into one character vector of DOAJ identifiers.
doaj.issn <- c(as.character(doaj$Journal.ISSN..print.version.),
               as.character(doaj$Journal.EISSN..online.version.))
# Drop blanks AND missing values. A bare `!doaj.issn == ""` keeps NA entries,
# and a retained NA makes `NA %in% doaj.issn` TRUE, which would flag every
# record without an ISSN as a DOAJ match.
doaj.issn <- doaj.issn[!is.na(doaj.issn) & doaj.issn != ""]

# Convert to class character so the ISSNs compare as plain strings.
vu_amst$SN <- as.character(vu_amst$SN)

Match

# Flag each VU record whose ISSN occurs among the DOAJ ISSNs
vu_amst[["DOAJ"]] <- is.element(vu_amst[["SN"]], doaj.issn)
# Count matched (TRUE) and unmatched (FALSE) records
table(vu_amst[["DOAJ"]])
## 
## FALSE  TRUE 
## 14514  1924

Merge

# Inner-join the VU records to DOAJ twice: first on the print ISSN, then on
# the online ISSN.
tt <- merge(
  vu_amst, doaj,
  by.x = "SN", by.y = "Journal.ISSN..print.version."
)
tt_2 <- merge(
  vu_amst, doaj,
  by.x = "SN", by.y = "Journal.EISSN..online.version."
)
# Each join keeps the *other* ISSN column, so the two results differ in one
# column name; reuse the names of `tt` so the frames can be stacked.
# NOTE(review): a record would appear twice if its SN matched a print ISSN in
# one DOAJ row and an EISSN in another -- confirm this cannot happen.
names(tt_2) <- names(tt)
vu_doaj <- rbind(tt, tt_2)

It is important to check for journals that were converted to OA and to exclude those articles that were published before the conversion. This is especially important for the SCOAP3 journals.

# Keep only articles published in or after the year recorded as the
# journal's first calendar year of online Open Access content.
vu_doaj <- vu_doaj %>%
  filter(PY >= First.calendar.year.journal.provided.online.Open.Access.content)

Explore Data

Let's drop levels not required before exploring the data

# Remove factor levels that no longer occur in any row
vu_doaj <- vu_doaj %>% droplevels()

By Year published

# Tabulate both data sets over the same set of years. Two independent
# table() calls only line up if every year also has at least one DOAJ
# article; with shared levels a missing year is counted as 0 instead.
years <- sort(unique(vu_amst$PY))
n_all <- table(factor(vu_amst$PY, levels = years))
n_doaj <- table(factor(vu_doaj$PY, levels = years))
# create matrix: all records, DOAJ-matched records, DOAJ share in percent
by_year <- rbind(all = n_all, doaj = n_doaj, share = n_doaj / n_all * 100)
# print
knitr::kable(by_year, digits = 2)
2012 2013 2014
all 5181.00 5526.00 5730.00
doaj 493.00 635.00 751.00
share 9.52 11.49 13.11

By publisher

Publisher names are a bit messy in the Web of Science. Before we tabulate the OA-publishers by year, let's clean up some publisher names:

# Harmonise publisher name variants. Work on a character copy: assigning a
# label that is not already a level of a factor yields NA (with only a
# warning), so the replacement must not depend on the existing levels.
pu_was_factor <- is.factor(vu_doaj$PU)
publisher <- as.character(vu_doaj$PU)
publisher[grepl("Wiley", publisher, ignore.case = TRUE)] <- "WILEY-BLACKWELL"
publisher[grepl("FRONTIERS", publisher, ignore.case = TRUE)] <- "FRONTIERS RESEARCH FOUNDATION"
publisher[grepl("ELSEVIER", publisher, ignore.case = TRUE)] <- "ELSEVIER SCIENCE BV"
# Restore the original column type; rebuilding the factor also drops the
# name variants that were just merged away.
vu_doaj$PU <- if (pu_was_factor) factor(publisher) else publisher

We've identified 105 OA publishers. To calculate OA Gold publications over publishers:

# Articles per publisher, largest first
vu_doaj %>%
  count(PU) %>%
  arrange(desc(n))
## Source: local data frame [105 x 2]
## 
##                                PU     n
##                            (fctr) (int)
## 1              BIOMED CENTRAL LTD   621
## 2          PUBLIC LIBRARY SCIENCE   545
## 3     COPERNICUS GESELLSCHAFT MBH   101
## 4   FRONTIERS RESEARCH FOUNDATION   100
## 5       FERRATA STORTI FOUNDATION    38
## 6         NATURE PUBLISHING GROUP    35
## 7                         MDPI AG    34
## 8          JMIR PUBLICATIONS, INC    30
## 9  HINDAWI PUBLISHING CORPORATION    28
## 10                WILEY-BLACKWELL    27
## ..                            ...   ...

Plot OA Gold publications over publishers and year published

library(dplyr)

# Order the publisher levels by descending article count. names() works
# whether or not sort()/rev() keep the "table" class, whereas
# rownames(data.frame(<table>)) returns "1", "2", ... when the object is
# still a table, which would turn every publisher into NA.
pu_by_count <- names(rev(sort(table(vu_doaj$PU))))
vu_doaj$PU <- factor(vu_doaj$PU, levels = pu_by_count)

# take only the five most popular publishers and lump the rest into a single
# "other" level; guard the range because 6:n runs backwards when n < 6
n_other <- length(unique(vu_doaj$PU)) - 5
if (nlevels(vu_doaj$PU) > 5) {
  levels(vu_doaj$PU)[6:nlevels(vu_doaj$PU)] <- paste0("other (n=", n_other, ")")
}

# Articles per publisher and publication year
publisher_by_yr <- group_by(vu_doaj, PU, PY) %>% tally()
publisher_by_yr
## Source: local data frame [18 x 3]
## Groups: PU [?]
## 
##                               PU    PY     n
##                           (fctr) (int) (int)
## 1             BIOMED CENTRAL LTD  2012   175
## 2             BIOMED CENTRAL LTD  2013   217
## 3             BIOMED CENTRAL LTD  2014   229
## 4         PUBLIC LIBRARY SCIENCE  2012   148
## 5         PUBLIC LIBRARY SCIENCE  2013   210
## 6         PUBLIC LIBRARY SCIENCE  2014   187
## 7    COPERNICUS GESELLSCHAFT MBH  2012    27
## 8    COPERNICUS GESELLSCHAFT MBH  2013    34
## 9    COPERNICUS GESELLSCHAFT MBH  2014    40
## 10 FRONTIERS RESEARCH FOUNDATION  2012    17
## 11 FRONTIERS RESEARCH FOUNDATION  2013    26
## 12 FRONTIERS RESEARCH FOUNDATION  2014    57
## 13     FERRATA STORTI FOUNDATION  2012     6
## 14     FERRATA STORTI FOUNDATION  2013     8
## 15     FERRATA STORTI FOUNDATION  2014    24
## 16                 other (n=100)  2012   120
## 17                 other (n=100)  2013   140
## 18                 other (n=100)  2014   214
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the ggplot() call fail afterwards.
library(ggplot2)
# Stacked area chart: one band per publisher, years on the x axis
ggplot(publisher_by_yr, aes(factor(PY), n, fill = PU, group = PU)) +
  geom_area(position = "stack") +
  scale_fill_manual(
    "Publisher",
    values = c("#f39c12", "#2980b9", "#2ecc71", "#fb8072", "#ffffb3", "#bdc3c7")
  ) +
  xlab("Year") +
  ylab("ISI OA Gold articles") +
  theme_bw()

plot of chunk unnamed-chunk-10

By journal

# relevel by journal, most frequent first. names() is used instead of
# rownames(data.frame(<table>)), which returns "1", "2", ... when the sorted
# object is still a table and would turn every journal into NA.
journals_by_count <- names(rev(sort(table(vu_doaj$JI))))
vu_doaj$JI <- factor(vu_doaj$JI, levels = journals_by_count)

# Articles per journal and publisher
group_by(vu_doaj, JI, PU) %>% tally()
## Source: local data frame [253 x 3]
## Groups: JI [?]
## 
##                                 JI                            PU     n
##                             (fctr)                        (fctr) (int)
## 1                         PLoS One        PUBLIC LIBRARY SCIENCE   493
## 2                BMC Public Health            BIOMED CENTRAL LTD    89
## 3                   BMC Psychiatry            BIOMED CENTRAL LTD    41
## 4  Int. J. Behav. Nutr. Phys. Act.            BIOMED CENTRAL LTD    40
## 5                  Front. Psychol. FRONTIERS RESEARCH FOUNDATION    34
## 6                  BMC Fam. Pract.            BIOMED CENTRAL LTD    34
## 7         BMC Pregnancy Childbirth            BIOMED CENTRAL LTD    33
## 8            J. Med. Internet Res.                 other (n=100)    31
## 9            BMC Health Serv. Res.            BIOMED CENTRAL LTD    31
## 10                   Haematologica     FERRATA STORTI FOUNDATION    30
## ..                             ...                           ...   ...
## Load Data
```{r}
# Attach dplyr with library(): unlike require(), it stops with an error when
# the package is missing instead of returning FALSE and failing later on.
library(dplyr)
# Load the ISI spreadsheet (semicolon-separated, empty cells read as NA) and
# select only the columns needed.
vu_amst <- read.csv("pubs_2012-14 V5 CSV.csv", header = TRUE, sep = ";",
                    na.strings = "") %>%
  select(JI, PY, SN, PU, DI, UT)
tbl_df(vu_amst)
# Load the DOAJ spreadsheet and select only the columns needed.
# Check the HTTP status first so a failed download stops here instead of an
# error page being parsed as if it were the CSV.
doaj_response <- httr::GET("http://doaj.org/csv")
httr::stop_for_status(doaj_response)
doaj <- httr::content(doaj_response) %>%
  select(Journal.ISSN..print.version.,
         Journal.EISSN..online.version.,
         Journal.article.processing.charges..APCs.,
         First.calendar.year.journal.provided.online.Open.Access.content)
tbl_df(doaj)
```
## Prepare Match
```{r}
# Join print ISSN and EISSN into one character vector of DOAJ identifiers.
doaj.issn <- c(as.character(doaj$Journal.ISSN..print.version.),
               as.character(doaj$Journal.EISSN..online.version.))
# Drop blanks AND missing values. A bare `!doaj.issn == ""` keeps NA entries,
# and a retained NA makes `NA %in% doaj.issn` TRUE, which would flag every
# record without an ISSN as a DOAJ match.
doaj.issn <- doaj.issn[!is.na(doaj.issn) & doaj.issn != ""]
# Convert to class character so the ISSNs compare as plain strings.
vu_amst$SN <- as.character(vu_amst$SN)
```
## Match
```{r}
# Flag each VU record whose ISSN occurs among the DOAJ ISSNs
vu_amst[["DOAJ"]] <- is.element(vu_amst[["SN"]], doaj.issn)
# Count matched (TRUE) and unmatched (FALSE) records
table(vu_amst[["DOAJ"]])
```
## Merge
```{r}
# Inner-join the VU records to DOAJ twice: first on the print ISSN, then on
# the online ISSN.
tt <- merge(
  vu_amst, doaj,
  by.x = "SN", by.y = "Journal.ISSN..print.version."
)
tt_2 <- merge(
  vu_amst, doaj,
  by.x = "SN", by.y = "Journal.EISSN..online.version."
)
# Each join keeps the *other* ISSN column, so the two results differ in one
# column name; reuse the names of `tt` so the frames can be stacked.
# NOTE(review): a record would appear twice if its SN matched a print ISSN in
# one DOAJ row and an EISSN in another -- confirm this cannot happen.
names(tt_2) <- names(tt)
vu_doaj <- rbind(tt, tt_2)
```
It is important to check for journals that were converted to OA and to exclude those articles that were published before the conversion. This is especially important for the SCOAP3 journals.
```{r}
# Keep only articles published in or after the year recorded as the
# journal's first calendar year of online Open Access content.
vu_doaj <- vu_doaj %>%
  filter(PY >= First.calendar.year.journal.provided.online.Open.Access.content)
```
## Explore Data
Let's drop levels not required before exploring the data
```{r}
# Remove factor levels that no longer occur in any row
vu_doaj <- vu_doaj %>% droplevels()
```
### By Year published
```{r}
# Tabulate both data sets over the same set of years. Two independent
# table() calls only line up if every year also has at least one DOAJ
# article; with shared levels a missing year is counted as 0 instead.
years <- sort(unique(vu_amst$PY))
n_all <- table(factor(vu_amst$PY, levels = years))
n_doaj <- table(factor(vu_doaj$PY, levels = years))
# create matrix: all records, DOAJ-matched records, DOAJ share in percent
by_year <- rbind(all = n_all, doaj = n_doaj, share = n_doaj / n_all * 100)
# print
knitr::kable(by_year, digits = 2)
```
### By publisher
Publisher names are a bit messy in the Web of Science. Before we tabulate the OA-publishers by year, let's clean up some publisher names:
```{r}
# Harmonise publisher name variants. Work on a character copy: assigning a
# label that is not already a level of a factor yields NA (with only a
# warning), so the replacement must not depend on the existing levels.
pu_was_factor <- is.factor(vu_doaj$PU)
publisher <- as.character(vu_doaj$PU)
publisher[grepl("Wiley", publisher, ignore.case = TRUE)] <- "WILEY-BLACKWELL"
publisher[grepl("FRONTIERS", publisher, ignore.case = TRUE)] <- "FRONTIERS RESEARCH FOUNDATION"
publisher[grepl("ELSEVIER", publisher, ignore.case = TRUE)] <- "ELSEVIER SCIENCE BV"
# Restore the original column type; rebuilding the factor also drops the
# name variants that were just merged away.
vu_doaj$PU <- if (pu_was_factor) factor(publisher) else publisher
```
We've identified `r length(unique(vu_doaj$PU))` OA publishers. To calculate OA Gold publications over publishers:
```{r}
# Articles per publisher, largest first
vu_doaj %>%
  count(PU) %>%
  arrange(desc(n))
```
Plot OA Gold publications over publishers and year published
```{r}
library(dplyr)
# Order the publisher levels by descending article count. names() works
# whether or not sort()/rev() keep the "table" class, whereas
# rownames(data.frame(<table>)) returns "1", "2", ... when the object is
# still a table, which would turn every publisher into NA.
pu_by_count <- names(rev(sort(table(vu_doaj$PU))))
vu_doaj$PU <- factor(vu_doaj$PU, levels = pu_by_count)
# take only the five most popular publishers and lump the rest into a single
# "other" level; guard the range because 6:n runs backwards when n < 6
n_other <- length(unique(vu_doaj$PU)) - 5
if (nlevels(vu_doaj$PU) > 5) {
  levels(vu_doaj$PU)[6:nlevels(vu_doaj$PU)] <- paste0("other (n=", n_other, ")")
}
# Articles per publisher and publication year
publisher_by_yr <- group_by(vu_doaj, PU, PY) %>% tally()
publisher_by_yr
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the ggplot() call fail afterwards.
library(ggplot2)
# Stacked area chart: one band per publisher, years on the x axis
ggplot(publisher_by_yr, aes(factor(PY), n, fill = PU, group = PU)) +
  geom_area(position = "stack") +
  scale_fill_manual(
    "Publisher",
    values = c("#f39c12", "#2980b9", "#2ecc71", "#fb8072", "#ffffb3", "#bdc3c7")
  ) +
  xlab("Year") +
  ylab("ISI OA Gold articles") +
  theme_bw()
```
### By journal
```{r}
# relevel by journal, most frequent first. names() is used instead of
# rownames(data.frame(<table>)), which returns "1", "2", ... when the sorted
# object is still a table and would turn every journal into NA.
journals_by_count <- names(rev(sort(table(vu_doaj$JI))))
vu_doaj$JI <- factor(vu_doaj$JI, levels = journals_by_count)
# Articles per journal and publisher
group_by(vu_doaj, JI, PU) %>% tally()
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment