njahn82 · April 3, 2018 13:03
diff --git a/figshare_zenodo.r b/figshare_zenodo.r
 #' libraries needed
 library(tidyverse)
 library(jsonlite)
 library(tidytext)
 library(quanteda)
 #' Zenodo metadata dump (https://zenodo.org/record/1065507#.WsNkb9NubFw):
 #' read the TSV export and keep only the record title and DOI columns.
 zenodo <- dplyr::select(
   readr::read_tsv("data/zenodo-all-metadata-records_2017-01-17.tsv"),
   title,
   doi
 )
 #' Figshare metadata dump
 #' (https://figshare.com/articles/Figshare_Public_Metadata_until_02_12_2014/1320834).
 #' The file is large, so it is processed in batches, following the approach
 #' described in ?jsonlite::stream_in: each page of 5000 records is reduced
 #' to its title and DOI columns and appended to a temporary file.
 tmp <- tempfile()
 con_out <- file(tmp, open = "wb")
 con_in <- file("data/basic-metadata.json")
 jsonlite::stream_in(
   con_in,
   pagesize = 5000,
   handler = function(page) {
     jsonlite::stream_out(
       dplyr::select(page, title, DOI),
       con_out,
       pagesize = 1000
     )
   }
 )
 close(con_out)
 #' Read the reduced records back from the temporary file.
 figshare <- jsonlite::stream_in(file(tmp))
 #' Keep a copy of the reduced data set on disk.
 jsonlite::stream_out(figshare, file("data/figshare_short.json"))
 #' Compare word usage between the two repositories, following the Twitter
 #' case study in the Tidy Text Mining book:
 #'
 #' https://www.tidytextmining.com/twitter.html
 #'
 #' 1. Stack the titles of both sources, tagging each row with its repository.
 my_df <- dplyr::bind_rows(
   dplyr::mutate(dplyr::select(zenodo, title), repo = "Zenodo"),
   dplyr::mutate(dplyr::select(figshare, title), repo = "Figshare")
 )
 #' 2. Clean the titles and tokenise them: strip HTML tags, punctuation and
 #' digits, split into one word per row, drop stop words, and stem the words.
 tidy_df <- my_df %>%
   mutate(
     #' the three substitutions are applied in order, each on the previous result
     title = gsub("<.*?>", "", title),
     title = gsub("[[:punct:]]", "", title),
     title = gsub("[[:digit:]]", "", title)
   ) %>%
   unnest_tokens(word, title) %>%
   filter(!(word %in% stop_words$word)) %>%
   mutate(token_stem = quanteda::char_wordstem(word))
 #' 3. Word ratios: log odds ratio of each stem occurring in Zenodo versus
 #' Figshare titles, restricted to stems seen at least 10 times overall.
 word_ratios <- tidy_df %>%
   count(token_stem, repo) %>%
   #' count() returns an ungrouped table, so group by stem before filtering;
   #' without this, sum(n) is the grand total and the rare-stem filter keeps
   #' every row (or none).
   group_by(token_stem) %>%
   filter(sum(n) >= 10) %>%
   ungroup() %>%
   spread(repo, n, fill = 0) %>%
   #' add-one smoothing so a stem missing from one repository does not
   #' produce a zero or infinite ratio
   mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
   mutate(logratio = log(Zenodo / Figshare)) %>%
   arrange(desc(logratio))
 #' Stems used about equally often in both repositories (log ratio closest
 #' to zero first).
 dplyr::arrange(word_ratios, abs(logratio))
 #' Most distinctive stems: the 15 largest absolute log ratios on each side
 #' of zero, drawn as a horizontal bar chart and saved as PNG.
 word_ratios %>%
   group_by(logratio < 0) %>%
   top_n(15, abs(logratio)) %>%
   ungroup() %>%
   mutate(word = reorder(token_stem, logratio)) %>%
   ggplot(aes(x = word, y = logratio, fill = logratio < 0)) +
   geom_col(show.legend = TRUE) +
   coord_flip() +
   scale_fill_manual(
     #' fill is the logical `logratio < 0`: FALSE (first) is Zenodo,
     #' TRUE (second) is Figshare
     values = c("#30638E", "#EDAE49"),
     labels = c("Zenodo", "Figshare"),
     name = "Repository"
   ) +
   labs(
     title = "Comparing word usage in record titles between Zenodo and Figshare",
     y = "log odds ratio (Zenodo / Figshare Titles)"
   ) +
   theme_minimal(base_family = "Arial Narrow")
 ggsave("log_words.png", dpi = 300)

 #' Relative frequency of each stem within each repository.
 frequency_df <- tidy_df %>%
   group_by(repo) %>%
   count(token_stem, sort = TRUE) %>%
   #' NOTE(review): `total` is the number of titles per repository (rows of
   #' my_df), not the number of tokens, so `freq` is occurrences per title --
   #' confirm this denominator is intended.
   left_join(
     my_df %>%
       group_by(repo) %>%
       summarise(total = n()),
     #' join key stated explicitly instead of relying on the natural join
     by = "repo"
   ) %>%
   mutate(freq = n / total)
 #' Bar chart of the 15 most frequent stems per repository, one facet per
 #' repository, saved as PNG.
 frequency_df %>%
   group_by(repo) %>%
   top_n(15, freq) %>%
   ggplot(aes(x = token_stem, y = freq, fill = repo)) +
   geom_bar(stat = "identity") +
   facet_wrap(~ repo) +
   coord_flip() +
   scale_fill_manual(
     name = "Repository",
     labels = c("Figshare", "Zenodo"),
     values = c("#EDAE49", "#30638E")
   ) +
   theme_minimal(base_family = "Arial Narrow") +
   labs(
     title = "Most used title words (stemmed)",
     x = "Words (stemmed)",
     y = "Proportion"
   )
 ggsave("frequecy_words.png", dpi = 300)
	#' libraries needed
	library(tidyverse)
	library(jsonlite)
	library(tidytext)
	library(quanteda)
	#' Zenodo metadata dump (https://zenodo.org/record/1065507#.WsNkb9NubFw):
	#' read the TSV export and keep only the record title and DOI columns.
	zenodo <- dplyr::select(
	  readr::read_tsv("data/zenodo-all-metadata-records_2017-01-17.tsv"),
	  title,
	  doi
	)
	#' Figshare metadata dump
	#' (https://figshare.com/articles/Figshare_Public_Metadata_until_02_12_2014/1320834).
	#' The file is large, so it is processed in batches, following the approach
	#' described in ?jsonlite::stream_in: each page of 5000 records is reduced
	#' to its title and DOI columns and appended to a temporary file.
	tmp <- tempfile()
	con_out <- file(tmp, open = "wb")
	con_in <- file("data/basic-metadata.json")
	jsonlite::stream_in(
	  con_in,
	  pagesize = 5000,
	  handler = function(page) {
	    jsonlite::stream_out(
	      dplyr::select(page, title, DOI),
	      con_out,
	      pagesize = 1000
	    )
	  }
	)
	close(con_out)
	#' Read the reduced records back from the temporary file.
	figshare <- jsonlite::stream_in(file(tmp))
	#' Keep a copy of the reduced data set on disk.
	jsonlite::stream_out(figshare, file("data/figshare_short.json"))
	#' Compare word usage between the two repositories, following the Twitter
	#' case study in the Tidy Text Mining book:
	#'
	#' https://www.tidytextmining.com/twitter.html
	#'
	#' 1. Stack the titles of both sources, tagging each row with its repository.
	my_df <- dplyr::bind_rows(
	  dplyr::mutate(dplyr::select(zenodo, title), repo = "Zenodo"),
	  dplyr::mutate(dplyr::select(figshare, title), repo = "Figshare")
	)
	#' 2. Clean the titles and tokenise them: strip HTML tags, punctuation and
	#' digits, split into one word per row, drop stop words, and stem the words.
	tidy_df <- my_df %>%
	  mutate(
	    #' the three substitutions are applied in order, each on the previous result
	    title = gsub("<.*?>", "", title),
	    title = gsub("[[:punct:]]", "", title),
	    title = gsub("[[:digit:]]", "", title)
	  ) %>%
	  unnest_tokens(word, title) %>%
	  filter(!(word %in% stop_words$word)) %>%
	  mutate(token_stem = quanteda::char_wordstem(word))
	#' 3. Word ratios: log odds ratio of each stem occurring in Zenodo versus
	#' Figshare titles, restricted to stems seen at least 10 times overall.
	word_ratios <- tidy_df %>%
	  count(token_stem, repo) %>%
	  #' count() returns an ungrouped table, so group by stem before filtering;
	  #' without this, sum(n) is the grand total and the rare-stem filter keeps
	  #' every row (or none).
	  group_by(token_stem) %>%
	  filter(sum(n) >= 10) %>%
	  ungroup() %>%
	  spread(repo, n, fill = 0) %>%
	  #' add-one smoothing so a stem missing from one repository does not
	  #' produce a zero or infinite ratio
	  mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
	  mutate(logratio = log(Zenodo / Figshare)) %>%
	  arrange(desc(logratio))
	#' Stems used about equally often in both repositories (log ratio closest
	#' to zero first).
	dplyr::arrange(word_ratios, abs(logratio))
	#' Most distinctive stems: the 15 largest absolute log ratios on each side
	#' of zero, drawn as a horizontal bar chart and saved as PNG.
	word_ratios %>%
	  group_by(logratio < 0) %>%
	  top_n(15, abs(logratio)) %>%
	  ungroup() %>%
	  mutate(word = reorder(token_stem, logratio)) %>%
	  ggplot(aes(x = word, y = logratio, fill = logratio < 0)) +
	  geom_col(show.legend = TRUE) +
	  coord_flip() +
	  scale_fill_manual(
	    #' fill is the logical `logratio < 0`: FALSE (first) is Zenodo,
	    #' TRUE (second) is Figshare
	    values = c("#30638E", "#EDAE49"),
	    labels = c("Zenodo", "Figshare"),
	    name = "Repository"
	  ) +
	  labs(
	    title = "Comparing word usage in record titles between Zenodo and Figshare",
	    y = "log odds ratio (Zenodo / Figshare Titles)"
	  ) +
	  theme_minimal(base_family = "Arial Narrow")
	ggsave("log_words.png", dpi = 300)

	#' Relative frequency of each stem within each repository.
	frequency_df <- tidy_df %>%
	  group_by(repo) %>%
	  count(token_stem, sort = TRUE) %>%
	  #' NOTE(review): `total` is the number of titles per repository (rows of
	  #' my_df), not the number of tokens, so `freq` is occurrences per title --
	  #' confirm this denominator is intended.
	  left_join(
	    my_df %>%
	      group_by(repo) %>%
	      summarise(total = n()),
	    #' join key stated explicitly instead of relying on the natural join
	    by = "repo"
	  ) %>%
	  mutate(freq = n / total)
	#' Bar chart of the 15 most frequent stems per repository, one facet per
	#' repository, saved as PNG.
	frequency_df %>%
	  group_by(repo) %>%
	  top_n(15, freq) %>%
	  ggplot(aes(x = token_stem, y = freq, fill = repo)) +
	  geom_bar(stat = "identity") +
	  facet_wrap(~ repo) +
	  coord_flip() +
	  scale_fill_manual(
	    name = "Repository",
	    labels = c("Figshare", "Zenodo"),
	    values = c("#EDAE49", "#30638E")
	  ) +
	  theme_minimal(base_family = "Arial Narrow") +
	  labs(
	    title = "Most used title words (stemmed)",
	    x = "Words (stemmed)",
	    y = "Proportion"
	  )
	ggsave("frequecy_words.png", dpi = 300)