Created
February 19, 2019 05:24
-
-
Save sneakers-the-rat/f63705bf4994cfc001abead5dd905447 to your computer and use it in GitHub Desktop.
Cleaning up a whitespace-delimited table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest)
library(dplyr)
library(stringr)

# Scrape a whitespace-delimited results table from alltime-athletics.com,
# turn it into a data frame with typed columns, and save it with saveRDS().
# NOTE(review): the URL points at "m_100ok.htm" while the objects are named
# m200 / m2_* -- confirm which event is actually intended.
m200 <- read_html("http://www.alltime-athletics.com/m_100ok.htm")

# The table is plain text inside a <pre> element selected by this CSS path.
m2_table <- m200 %>%
  html_nodes("body > center:nth-child(8) > pre")

# Fail with a readable message instead of "subscript out of bounds" below
# when the page layout changes and the selector matches nothing.
if (length(m2_table) == 0L) {
  stop("No <pre> table found at the expected CSS path; page layout may have changed.",
       call. = FALSE)
}

# Make a list of rows: serialise the node, drop literal backslashes,
# then split on the newline character.
m2_table <- m2_table %>%
  as.character() %>%
  str_replace_all("\\\\", "") %>%
  str_split("\n")

# Split each row into fields on runs of two or more whitespace characters.
# strsplit() is already vectorised, so no sapply() wrapper is needed; this
# also avoids using every raw text line as a row name of the data frame.
m2_mat <- strsplit(m2_table[[1]], split = "\\s{2,}")

# Combine the rows into a data frame.
# NOTE(review): rbind.data.frame() silently recycles rows that have fewer
# fields than the widest row, so malformed lines end up misaligned.
m2_df <- do.call(rbind.data.frame, m2_mat)

# On R < 4.0 the columns come back as factors; force character so that the
# numeric/date conversions below parse the text rather than the level codes.
m2_df[] <- lapply(m2_df, as.character)

# Get rid of the broken first row and the first column.
m2_df <- m2_df[-1, -1]

# Rename and re-class the columns.
names(m2_df) <- c("place", "time", "delta", "runner", "country", "dob",
                  "unknown", "olympic_city", "date")
m2_df$time <- as.numeric(m2_df$time)

# "%y" maps two-digit years 00-68 to 20xx, so a birth year such as "65" would
# be parsed as 2065. Any date of birth that lands in the future is re-parsed
# with an explicit "19" century prefix.
dob_raw <- m2_df$dob
dob <- as.POSIXct(dob_raw, format = "%d.%m.%y")
in_future <- !is.na(dob) & dob > Sys.time()
dob[in_future] <- as.POSIXct(
  sub("([0-9]{2})\\s*$", "19\\1", dob_raw[in_future]),
  format = "%d.%m.%Y"
)
m2_df$dob <- dob

m2_df$date <- as.POSIXct(m2_df$date, format = "%d.%m.%Y")

# NOTE(review): the file is written with saveRDS() (read it back with
# readRDS(), not load()) even though the extension is ".RData"; the path is
# kept unchanged so existing readers keep working.
saveRDS(m2_df, "./olympics.RData")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment