Last active
October 22, 2018 14:30
-
-
Save muschellij2/256fe5b49e9b15e165a9a03213c7c9d2 to your computer and use it in GitHub Desktop.
Mime Helpers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(dplyr) | |
library(tidyr) | |
# ---- Source 1: freeformatter.com MIME type list ----
url <- "https://www.freeformatter.com/mime-types-list.html"
doc <- read_html(url)

# The page is expected to hold exactly one HTML table; stop if it does not.
tab <- html_table(doc)
stopifnot(length(tab) == 1)
tab <- tab[[1]]

# Give the scraped columns short snake_case names.
tab <- rename(
  tab,
  name = Name,
  mime_type = `MIME Type / Internet Media Type`,
  ext = `File Extension`,
  details = `More Details`
)

# Strip a trailing comma from the MIME type and turn the "N/A" placeholder
# into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(",$", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)

# Every row must carry a MIME type.
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)
#################################
# Few cases with multiple extensions
#################################
# A single pattern (comma plus optional whitespace) is used both to count and
# to split the extensions, so the two steps cannot disagree when a cell is
# written "a,b" rather than "a, b".
ext_sep <- ",\\s*"

# lengths() always yields an integer vector; sapply() can change its return
# type on empty or ragged input.
n_ext <- lengths(strsplit(tab$ext, split = ext_sep))
tab$n_ext <- n_ext

# Sanity check on the scraped data: the split below only provides two
# target columns.
stopifnot(max(n_ext) == 2)

# Spread the extensions over two columns; rows with one extension get NA in
# ext_2 (fill = "right").
tab <- tab %>%
  separate(ext, into = c("ext_1", "ext_2"),
           sep = ext_sep, fill = "right")

# Back to one row per extension. Rows whose slot number exceeds the number of
# extensions the original cell held are the NA padding and are dropped.
tab <- tab %>%
  gather(key = ext_number, value = ext, ext_1, ext_2) %>%
  mutate(ext_number = as.numeric(sub("ext_", "", ext_number))) %>%
  filter(ext_number <= n_ext) %>%
  arrange(mime_type, ext_number) %>%
  select(-ext_number)

# test case
tab[grepl("atom", tab$mime_type), ]
tab[tab$n_ext > 1, ]
#################################
# Few cases with multiple mime types
#################################
# Split on a comma together with any surrounding whitespace, so that a cell
# written "a, b" does not leave a leading space on the second MIME type.
mime_sep <- "\\s*,\\s*"

# lengths() always yields an integer vector; sapply() can change its return
# type on empty or ragged input.
n_mime <- lengths(strsplit(tab$mime_type, split = mime_sep))
tab$n_mime <- n_mime

# Sanity check on the scraped data: the split below only provides two
# target columns.
stopifnot(max(n_mime) == 2)

# Spread the MIME types over two columns; rows with a single type get NA in
# mime_2 (fill = "right").
tab <- tab %>%
  separate(mime_type, into = c("mime_1", "mime_2"),
           sep = mime_sep, fill = "right")

# Back to one row per MIME type, dropping the NA padding rows, then remove
# the helper columns.
tab <- tab %>%
  gather(key = mime_number, value = mime_type, mime_1, mime_2) %>%
  mutate(mime_number = as.numeric(sub("mime_", "", mime_number))) %>%
  filter(mime_number <= n_mime) %>%
  arrange(mime_type, mime_number) %>%
  select(-mime_number, -n_mime)

# test case
tab[grepl("java", tab$mime_type), ]
tab[is.na(tab$mime_type), ]
# Keep only rows that have both a MIME type and a non-empty extension, reduce
# the table to those two columns, and remember it as the first source.
first_tab <- tab %>%
  filter(!(is.na(mime_type) | is.na(ext) | ext %in% "")) %>%
  select(mime_type, ext)
tab <- first_tab
# ---- Source 2: sitepoint.com MIME type list ----
url <- "https://www.sitepoint.com/mime-types-complete-list/"
doc <- read_html(url)

# The page is expected to hold exactly one HTML table; stop if it does not.
tab <- html_table(doc)
stopifnot(length(tab) == 1)
tab <- rename(
  tab[[1]],
  ext = `Suffixes applicable`,
  mime_type = `Media type and subtype(s)`
)

# Exactly two MIME type cells are expected to contain a space; anything else
# means the page changed and the clean-up below needs revisiting.
have_spaces <- grepl(" ", tab$mime_type)
stopifnot(sum(have_spaces) == 2)

# Keep only the text before the first space and turn the "N/A" placeholder
# into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(" .*", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)

# Every row must carry a MIME type.
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)

# Combine with the first source, joining on the columns the two tables share.
tab <- full_join(tab, first_tab)
# both_tab = | |
####################################
# Another set of types
####################################
url <- "https://raw.githubusercontent.com/hoaproject/Mime/master/Mime.types"
doc <- readLines(url)

# Drop comment lines and empty lines, then collapse runs of tabs so that each
# line is "<mime type>\t<space-separated extensions>".
doc <- doc[!grepl("^#", doc)]
doc <- doc[!(doc %in% "")]
doc <- gsub("\t+", "\t", doc)
fields <- strsplit(doc, "\t")

# First field of every line is the MIME type.
mime_col <- vapply(fields, function(x) x[1], character(1))

# Remaining fields hold the extensions; split on runs of spaces and discard
# empty pieces. Lines without extensions yield an empty vector and therefore
# contribute no rows.
ext_list <- lapply(fields, function(x) {
  pieces <- unlist(strsplit(x[-1], split = " +"))
  pieces[nzchar(pieces)]
})

# One row per (extension, MIME type) pair, built in a single step instead of
# binding one small table per line.
df <- tibble(
  ext = as.character(unlist(ext_list)),
  mime_type = rep(mime_col, lengths(ext_list))
)

# NOTE(review): extensions in this file appear to be listed without a leading
# dot -- confirm against the live file. The sub() adds the dot only when it is
# missing, so the values match the dotted form this script uses for the
# mime::mimemap extensions.
df <- df %>%
  filter(!is.na(ext),
         !is.na(mime_type)) %>%
  mutate(ext = sub("^[.]?", ".", ext))

# Join keys are spelled out so the join cannot silently pick up other columns.
tab <- full_join(tab, df, by = c("ext", "mime_type"))
# Extension -> MIME type map shipped with the 'mime' package. The vector's
# names are the extensions (prefixed with "." here) and its values are the
# MIME types; unname() keeps the names out of the mime_type column.
# tibble() replaces the deprecated data_frame().
mime_df <- tibble(
  mime_type = unname(mime::mimemap),
  ext = paste0(".", names(mime::mimemap))
)

# Final combined table. Join keys are spelled out so the join cannot silently
# pick up other columns.
mime_extensions <- full_join(tab, mime_df, by = c("mime_type", "ext"))

# Spot check; %in% never returns NA, so no all-NA rows can appear here.
mime_extensions[mime_extensions$mime_type %in% "text/plain", ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment