Last active
February 3, 2022 17:54
-
-
Save ctesta01/dca499202d83836c5cd696734bf95b8d to your computer and use it in GitHub Desktop.
Black History Milestones Scraped from History.com and Visualized
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2)
library(magrittr)
library(rvest)
library(dplyr)

# Scrape the "Black History Milestones" article from History.com and draw the
# milestones as a tall vertical timeline saved to black_history.png.

# Scrape the milestone headings ----

# url to fetch html from
source_url <- "https://www.history.com/topics/black-history/black-history-milestones"

# parse html
page <- read_html(source_url)

# select h2 elements and read off their ids
headings <- page %>% html_elements("h2")
heading_ids <- headings %>% html_attr("id")

# ids of the headings that hold the event text of interest: the numbered
# "section_" headings plus three headings that have descriptive ids instead
event_heading_ids <- c(
  paste0("section_", 1:44),
  "the-black-lives-matter-movement",
  "george-floyd-protests",
  "kamala-harris-becomes-the-first-woman-and-first-black-us-vice-president-2021"
)
headings <- headings[heading_ids %in% event_heading_ids]

# fail loudly rather than producing an empty plot if none of the expected ids
# are present on the page
if (length(headings) == 0) {
  stop(
    "No milestone headings found at ", source_url,
    "; the expected <h2> ids are not present on the page.",
    call. = FALSE
  )
}

# extract heading text and remove excess whitespace
headings_text <- headings %>%
  html_text() %>%
  stringr::str_trim()

# Derive a year for every heading ----

# extract heading dates - this uses regular expressions to read off the first
# 4-digit (e.g. YYYY) run in each heading; headings without one become NA
headings_dates <- stringr::str_extract(headings_text, "[0-9]{4}")

# manual coding for the headings that are given a year by hand, and the label
# (with the period appended) that replaces the original heading text.
# years are stored as strings because headings_dates is a character vector
manual_years <- c(
  "Rise of Black Power" = "1969",
  "The Black Lives Matter Movement" = "2013",
  "George Floyd Protests" = "2020"
)
manual_labels <- c(
  "Rise of Black Power" = "Rise of Black Power, late 1960s",
  "The Black Lives Matter Movement" = "The Black Lives Matter Movement, 2013",
  "George Floyd Protests" = "George Floyd Protests, 2020"
)
# look up both replacements against the *original* heading text before any
# label is rewritten
is_manual <- headings_text %in% names(manual_years)
manual_keys <- headings_text[is_manual]
headings_dates[is_manual] <- unname(manual_years[manual_keys])
headings_text[is_manual] <- unname(manual_labels[manual_keys])

# construct a tibble with the dates and text
df <- tibble::tibble(
  year = as.numeric(headings_dates),
  event = headings_text
)

# Position the labels ----

# the label positions include random jitter; fix the seed so that re-running
# the script reproduces the same layout
set.seed(2022)

# put an alternating sequence into the data frame to help position the timeline
# entries alternating either side, so they're more spaced out.
# if the labels are too close to the 0 midline, it's visually disruptive, so
# move them out to a distance of 2 on their own side
df %<>%
  mutate(
    row_number = row_number(),
    parity = row_number %% 2,
    label_position = parity * 5 - 2.5 + rnorm(n = n()),
    label_position = ifelse(
      abs(label_position) < 2,
      sign(label_position) * 2,
      label_position
    )
  )

# Plot ----

# extent of the central spine. This is computed from df directly because the
# `.data` pronoun is only meaningful inside aes() / data-masking verbs, not in
# ordinary layer arguments
year_range <- range(df$year, na.rm = TRUE)

timeline_plot <- ggplot(
  df,
  aes(
    y = year,
    x = 0,
    label = event,
    color = factor(row_number),
    fill = factor(row_number)
  )
) +
  # grey spine running from the earliest to the latest event
  annotate(
    "segment",
    x = 0,
    xend = 0,
    y = year_range[1],
    yend = year_range[2],
    color = "grey80"
  ) +
  # a point on the spine, a point at the label, and a connector between them
  geom_point() +
  geom_point(aes(x = label_position)) +
  geom_segment(aes(xend = label_position, yend = year)) +
  ggrepel::geom_label_repel(
    mapping = aes(x = label_position),
    size = 2.25,
    color = "white",
    segment.color = "grey80"
  ) +
  scale_fill_viridis_d(end = 0.6, option = "magma") +
  scale_color_viridis_d(end = 0.6, option = "magma") +
  # reversed axis so that earlier years are at the top
  scale_y_reverse(breaks = seq(1600, 2000, by = 50), position = "right") +
  theme_bw() +
  expand_limits(x = c(-3, 3)) +
  ggtitle(
    "Black History Milestones in the United States",
    "Adapted from https://www.history.com/topics/black-history/black-history-milestones"
  ) +
  labs(caption = "Milestones Based on https://www.history.com/topics/black-history/black-history-milestones") +
  theme(
    legend.position = "none",
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

# display the plot when the script is run interactively
timeline_plot

# the visualization is intended as a tall, long piece; pass the plot explicitly
# instead of relying on last_plot()
ggsave("black_history.png", plot = timeline_plot, height = 35, width = 8)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I really wanted to do something like this given that next week it will be Black History Month and I have been wanting to get more familiar with the `rvest` package for scraping data.
I like the long timeline format, which reminds me of https://xkcd.com/1732/. The original History.com article is available at https://www.history.com/topics/black-history/black-history-milestones, but I would be interested in finding a better source or curating a list of events more carefully.