Created
October 12, 2021 00:27
-
-
Save mbjones/497eea13546492ad2751f5b3b98fb13e to your computer and use it in GitHub Desktop.
Abstract lengths in DataONE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "abstracts"
author: "Matt Jones"
date: "10/11/2021"
output: html_document
---
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
library(vroom) | |
library(dplyr) | |
library(ggplot2) | |
``` | |
## Abstract Lengths
<!-- | |
for start in `seq 0 10000 850000`; do echo $start; curl "https://cn.dataone.org/cn/v2/query/solr/?q=formatType:METADATA+AND+-obsoletedBy:*&fl=identifier,abstract&wt=json&rows=10000&start=${start}" | jq '.response.docs[] | .abstract' |awk -F\" '{print length($2)}' |sort -n > abstract-${start}.txt; done; cat abstract-*.txt |sort -n > abstractlengths.csv | |
--> | |
```{r} | |
# Read the single-column file of abstract lengths into a tibble with one
# column, `abs_length`. The file has no header row, so the column name is
# supplied explicitly.
abstract_lengths <- vroom(
  "abstractlengths.csv",
  delim = ",",
  col_names = c("abs_length"),
  show_col_types = FALSE
)

# Count of records whose abstract length is zero.
abstract_lengths %>%
  filter(abs_length == 0) %>%
  nrow()

# Count of records with a short abstract (length <= 50). This count also
# includes the zero-length records reported above.
abstract_lengths %>%
  filter(abs_length <= 50) %>%
  nrow()

# Count of records with a long abstract (length >= 5000).
abstract_lengths %>%
  filter(abs_length >= 5000) %>%
  nrow()

# Fraction of all records whose abstract length lies strictly between the
# short and long cut-offs. The bounds are exclusive because lengths of exactly
# 50 and exactly 5000 are already counted in the short and long groups above;
# inclusive bounds would count those records twice and overstate this
# fraction. `%>%` binds tighter than `/`, so the pipeline's row count is
# divided by the total row count.
abstract_lengths %>%
  filter(abs_length > 50 & abs_length < 5000) %>%
  nrow() / nrow(abstract_lengths)
``` | |
```{r} | |
# Histogram of the upper tail only: records with an abstract length above
# 4500. Binning is left at the geom_histogram() defaults.
abstract_lengths %>%
  filter(abs_length > 4500) %>%
  ggplot(mapping = aes(x = abs_length)) +
  geom_histogram() +
  theme_bw()
``` | |
```{r} | |
# Print the rows whose abstract length is 5000 or more.
filter(abstract_lengths, abs_length >= 5000)
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment