#' Compare word usage in Zenodo and Figshare titles
#' libraries needed
library(tidyverse)
library(jsonlite)
library(tidytext)
library(quanteda)
#' import zenodo https://zenodo.org/record/1065507#.WsNkb9NubFw
zenodo <-
  readr::read_tsv("data/zenodo-all-metadata-records_2017-01-17.tsv") %>%
  dplyr::select(title, doi)
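#' optional sanity check: the TSV is expected to provide `title` and `doi`
#' columns; a quick look at what was imported
dplyr::glimpse(zenodo)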
#' import figshare https://figshare.com/articles/Figshare_Public_Metadata_until_02_12_2014/1320834
#' read in batches and select title and DOI;
#' approach as described in the ?stream_in docs because the file is large
con_in <- file("data/basic-metadata.json")
con_out <- file(tmp <- tempfile(), open = "wb")
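#' each batch passed to the handler is reduced to title and DOI and
#' streamed out as NDJSON to the temporary file, so the full dump never
#' has to be held in memory at once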
jsonlite::stream_in(
  con_in,
  handler = function(df) {
    df <- dplyr::select(df, title, DOI)
    jsonlite::stream_out(df, con_out, pagesize = 1000)
  },
  pagesize = 5000
)
close(con_out)
#' stream it from tmp file
figshare <- jsonlite::stream_in(file(tmp))
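#' optional sanity check: the streamed data frame should only contain
#' the reduced `title` and `DOI` columns
dplyr::glimpse(figshare)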
#' backup
jsonlite::stream_out(figshare, file("data/figshare_short.json"))
#' now, let's compare word usage following the approach described in the
#' tidytextmining book's case study on Twitter usage:
#'
#' https://www.tidytextmining.com/twitter.html
#'
#' 1. Join datasets
my_df <- dplyr::bind_rows(
  zenodo %>%
    select(title) %>%
    mutate(repo = "Zenodo"),
  figshare %>%
    select(title) %>%
    mutate(repo = "Figshare")
)
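#' optional check: how many titles per repository go into the comparison
my_df %>%
  count(repo)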
#' 2. Clean data: remove html tags in titles, unnest tokens,
#' remove stop words, and stem the words
tidy_df <- my_df %>%
  mutate(title = gsub("<.*?>", "", title)) %>%
  mutate(title = gsub("[[:punct:]]", "", title)) %>%
  mutate(title = gsub("[[:digit:]]", "", title)) %>%
  unnest_tokens(word, title) %>%
  filter(!word %in% stop_words$word) %>%
  mutate(token_stem = quanteda::char_wordstem(word))
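#' optional check: a few tokens and their stems, e.g. to spot leftover markup
tidy_df %>%
  select(repo, word, token_stem) %>%
  head(10)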
#' 3. Word ratios
word_ratios <- tidy_df %>%
  count(token_stem, repo) %>%
  group_by(token_stem) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  spread(repo, n, fill = 0) %>%
  mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
  mutate(logratio = log(Zenodo / Figshare)) %>%
  arrange(desc(logratio))
#' words used about equally often in both repositories
word_ratios %>%
  arrange(abs(logratio))
#' most different
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(token_stem, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  ylab("log odds ratio (Zenodo / Figshare Titles)") +
  scale_fill_manual(
    name = "Repository",
    labels = c("Zenodo", "Figshare"),
    values = c("#30638E", "#EDAE49")
  ) +
  theme_minimal(base_family = "Arial Narrow") +
  labs(title = "Comparing word usage in record titles between Zenodo and Figshare")
ggsave("log_words.png", dpi = 300)
#' word frequencies per repository (proportion of all title tokens)
frequency_df <- tidy_df %>%
  group_by(repo) %>%
  count(token_stem, sort = TRUE) %>%
  left_join(tidy_df %>%
              group_by(repo) %>%
              summarise(total = n())) %>%
  mutate(freq = n / total)
frequency_df %>%
  group_by(repo) %>%
  top_n(15, freq) %>%
  ggplot(aes(token_stem, freq, fill = repo)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ repo) +
  scale_fill_manual(
    name = "Repository",
    labels = rev(c("Zenodo", "Figshare")),
    values = rev(c("#30638E", "#EDAE49"))
  ) +
  labs(title = "Most used title words (stemmed)", x = "Words (stemmed)", y = "Proportion") +
  theme_minimal(base_family = "Arial Narrow")
ggsave("frequecy_words.png", dpi = 300)