|
#' libraries needed |
|
library(tidyverse) |
|
library(jsonlite) |
|
library(tidytext) |
|
library(quanteda) |
|
#' Import the Zenodo metadata dump
#' (https://zenodo.org/record/1065507#.WsNkb9NubFw),
#' keeping only the title and DOI of each record
zenodo <- dplyr::select(
  readr::read_tsv("data/zenodo-all-metadata-records_2017-01-17.tsv"),
  title,
  doi
)
|
#' Import the figshare metadata dump
#' https://figshare.com/articles/Figshare_Public_Metadata_until_02_12_2014/1320834
#'
#' The file is large, so it is read in batches and only title and DOI are
#' kept, following the approach described in the ?stream_in documentation.
tmp <- tempfile()
con_in <- file("data/basic-metadata.json")
con_out <- file(tmp, open = "wb")

# Handler applied to every page read from con_in: reduce the page to the two
# columns of interest and append it to the temporary output connection.
keep_title_and_doi <- function(page) {
  slim <- dplyr::select(page, title, DOI)
  jsonlite::stream_out(slim, con_out, pagesize = 1000)
}

jsonlite::stream_in(con_in, handler = keep_title_and_doi, pagesize = 5000)

# con_out was opened explicitly above, so it has to be closed explicitly too.
close(con_out)
|
#' Read the reduced records back from the temporary file
figshare <- tmp %>%
  file() %>%
  jsonlite::stream_in()

#' Backup of the reduced figshare records
jsonlite::stream_out(figshare, con = file("data/figshare_short.json"))
|
#' Now, let's compare word usage, following the approach described in the
#' Tidy Text Mining book's case study on Twitter usage:
#'
#' https://www.tidytextmining.com/twitter.html
|
#' |
|
#' 1. Join datasets
#'
#' Stack the titles from both sources into one data frame and tag every row
#' with the repository it came from.
zenodo_titles <- mutate(select(zenodo, title), repo = "Zenodo")
figshare_titles <- mutate(select(figshare, title), repo = "Figshare")

my_df <- dplyr::bind_rows(zenodo_titles, figshare_titles)
|
#' 2. Clean data: remove HTML tags, punctuation and digits from the titles,
#' unnest tokens, remove stop words, and stem the words
tidy_df <- my_df %>%
  mutate(
    # The three substitutions run in order: tags first, then punctuation,
    # then digits.
    title = gsub("<.*?>", "", title),
    title = gsub("[[:punct:]]", "", title),
    title = gsub("[[:digit:]]", "", title)
  ) %>%
  unnest_tokens(output = word, input = title) %>%
  filter(!(word %in% stop_words$word)) %>%
  mutate(token_stem = quanteda::char_wordstem(word))
|
#' 3. Word ratios
#'
#' For every stem that occurs at least 10 times across both repositories,
#' compute its add-one smoothed share among the retained stems in each
#' repository and the log ratio log(Zenodo / Figshare) of the two shares.
#' Positive values lean towards Zenodo, negative values towards Figshare.
word_ratios <- tidy_df %>%
  count(token_stem, repo) %>%
  # Group by stem so the threshold applies to each stem's combined count.
  # Without the grouping, sum(n) is the total over all rows and the filter
  # keeps or drops every row at once.
  group_by(token_stem) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  spread(repo, n, fill = 0) %>%
  # A plain function replaces funs(), which is deprecated in dplyr.
  mutate_if(is.numeric, function(x) (x + 1) / sum(x + 1)) %>%
  mutate(logratio = log(Zenodo / Figshare)) %>%
  arrange(desc(logratio))
|
#' most common: stems used at similar rates in both repositories
#' (log ratio closest to zero first)
arrange(word_ratios, abs(logratio))
|
#' most different: the 15 stems with the largest absolute log ratio on each
#' side (Zenodo-leaning and Figshare-leaning), ordered by log ratio
most_different <- word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(token_stem, logratio))

# The fill aesthetic is FALSE for stems leaning towards Zenodo and TRUE for
# stems leaning towards Figshare, which is the order the labels and colours
# below are given in.
ggplot(most_different, aes(x = word, y = logratio, fill = logratio < 0)) +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  scale_fill_manual(
    name = "Repository",
    labels = c("Zenodo", "Figshare"),
    values = c("#30638E", "#EDAE49")
  ) +
  labs(
    title = "Comparing word usage in record titles between Zenodo and Figshare",
    y = "log odds ratio (Zenodo / Figshare Titles)"
  ) +
  theme_minimal(base_family = "Arial Narrow")

# No plot argument: ggsave() writes the most recently created plot.
ggsave("log_words.png", dpi = 300)
|
|
|
#' frequencies
#'
#' Count every stem per repository and divide the count by the number of
#' rows (record titles) that repository contributes to my_df.
#'
#' NOTE(review): the denominator is the number of records, not the number of
#' tokens, so `freq` is "occurrences per record" rather than a share of all
#' words. Confirm this is intended; the tidytext recipe divides by the token
#' total instead.
frequency_df <- tidy_df %>%
  group_by(repo) %>%
  count(token_stem, sort = TRUE) %>%
  left_join(
    my_df %>%
      group_by(repo) %>%
      summarise(total = n()),
    # Explicit key: same join as the implicit natural join on the only shared
    # column, but without the "Joining, by" message and robust to columns
    # added later.
    by = "repo"
  ) %>%
  mutate(freq = n / total)
|
#' Plot the 15 highest-frequency stems of each repository, one panel per
#' repository
top_stems <- frequency_df %>%
  group_by(repo) %>%
  top_n(15, freq)

ggplot(top_stems, aes(x = token_stem, y = freq, fill = repo)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ repo) +
  scale_fill_manual(
    name = "Repository",
    # Same colour per repository as in the log-ratio plot, listed here in
    # reversed order (Figshare first).
    labels = c("Figshare", "Zenodo"),
    values = c("#EDAE49", "#30638E")
  ) +
  labs(
    title = "Most used title words (stemmed)",
    x = "Words (stemmed)",
    y = "Proportion"
  ) +
  theme_minimal(base_family = "Arial Narrow")

# No plot argument: ggsave() writes the most recently created plot.
ggsave("frequecy_words.png", dpi = 300)