Skip to content

Instantly share code, notes, and snippets.

@muschellij2
Last active October 22, 2018 14:30
Show Gist options
  • Save muschellij2/256fe5b49e9b15e165a9a03213c7c9d2 to your computer and use it in GitHub Desktop.
Save muschellij2/256fe5b49e9b15e165a9a03213c7c9d2 to your computer and use it in GitHub Desktop.
Mime Helpers
library(rvest)
library(dplyr)
library(tidyr)
# ---------------------------------------------------------------
# Source 1: the MIME type list published on freeformatter.com
# ---------------------------------------------------------------
url <- "https://www.freeformatter.com/mime-types-list.html"
doc <- read_html(url)

# The page is expected to hold exactly one HTML table; stop otherwise.
tab <- html_table(doc)
stopifnot(length(tab) == 1)
tab <- tab[[1]]

# Replace the long page headers with short snake_case column names.
tab <- rename(
  tab,
  name = Name,
  mime_type = `MIME Type / Internet Media Type`,
  ext = `File Extension`,
  details = `More Details`
)

# Strip a trailing comma from the MIME type and turn the literal "N/A"
# extension into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(",$", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)

# Every row must carry a MIME type.
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)
#################################
# Few cases with multiple extensions
#################################
# Expand rows that list several comma-separated extensions into one row per
# extension. Reads `tab` (columns `ext`, `mime_type`); leaves `tab` with a
# single extension in `ext` plus an `n_ext` column holding the number of
# extensions the original row listed.
#
# The count and the split use the SAME separator (a comma followed by any
# amount of whitespace). Counting on "," while splitting on ", " would count
# an entry written without a space as two extensions but never split it.
n_ext <- lengths(strsplit(tab$ext, split = ",\\s*"))
tab$n_ext <- n_ext

# The reshaping below only provides two slots (ext_1 / ext_2), so stop
# unless the widest row has exactly two extensions.
stopifnot(max(n_ext) == 2)

tab <- tab %>%
  separate(
    ext,
    into = c("ext_1", "ext_2"),
    sep = ",\\s*",
    fill = "right"
  )

tab <- tab %>%
  gather(key = ext_number, value = ext, ext_1, ext_2) %>%
  mutate(
    # "ext_1" / "ext_2" -> 1 / 2
    ext_number = as.numeric(sub("ext_", "", ext_number)),
    # remove any stray whitespace around an extension
    ext = trimws(ext)
  ) %>%
  # drop the padding slot of rows that listed a single extension
  # (`n_ext` here is the column of `tab`, evaluated row by row)
  filter(ext_number <= n_ext) %>%
  arrange(mime_type, ext_number) %>%
  select(-ext_number)

# Spot checks: rows whose MIME type mentions "atom", and every row that came
# from an entry with more than one extension.
tab[grepl("atom", tab$mime_type), ]
tab[tab$n_ext > 1, ]
#################################
# Few cases with multiple mime types
#################################
# Expand rows whose `mime_type` holds several comma-separated values into one
# row per MIME type, then reduce `tab` to complete (mime_type, ext) pairs and
# keep a copy in `first_tab`.
#
# The separator is a comma followed by any amount of whitespace, and each
# piece is trimmed: a piece that kept a leading blank (" x/y") would never
# match the same MIME type coming from another source in the later joins.
n_mime <- lengths(strsplit(tab$mime_type, split = ",\\s*"))
tab$n_mime <- n_mime

# The reshaping below only provides two slots (mime_1 / mime_2), so stop
# unless the widest row has exactly two MIME types.
stopifnot(max(n_mime) == 2)

tab <- tab %>%
  separate(
    mime_type,
    into = c("mime_1", "mime_2"),
    sep = ",\\s*",
    fill = "right"
  )

tab <- tab %>%
  gather(key = mime_number, value = mime_type, mime_1, mime_2) %>%
  mutate(
    # "mime_1" / "mime_2" -> 1 / 2
    mime_number = as.numeric(sub("mime_", "", mime_number)),
    mime_type = trimws(mime_type)
  ) %>%
  # drop the padding slot of rows that listed a single MIME type
  # (`n_mime` here is the column of `tab`, evaluated row by row)
  filter(mime_number <= n_mime) %>%
  arrange(mime_type, mime_number) %>%
  select(-mime_number, -n_mime)

# Spot checks: rows whose MIME type mentions "java", and any row left
# without a MIME type.
tab[grepl("java", tab$mime_type), ]
tab[is.na(tab$mime_type), ]

# Keep only rows that have both a MIME type and a non-empty extension, and
# only those two columns.
tab <- tab %>%
  filter(
    !is.na(mime_type),
    !is.na(ext),
    !ext %in% ""
  ) %>%
  select(mime_type, ext)

first_tab <- tab
# ---------------------------------------------------------------
# Source 2: the MIME type list published on sitepoint.com
# ---------------------------------------------------------------
url <- "https://www.sitepoint.com/mime-types-complete-list/"
doc <- read_html(url)

# The page is expected to hold exactly one HTML table; stop otherwise.
tab <- html_table(doc)
stopifnot(length(tab) == 1)

# Take that table and give its columns the names used for the first source.
tab <- rename(
  tab[[1]],
  ext = `Suffixes applicable`,
  mime_type = `Media type and subtype(s)`
)

# Exactly two MIME type cells are expected to contain a space; stop if the
# page no longer matches that.
have_spaces <- grepl(" ", tab$mime_type)
stopifnot(sum(have_spaces) == 2)

# Keep only the text before the first space of each MIME type, and turn the
# literal "N/A" extension into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(" .*", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)

# Every row must carry a MIME type.
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)

# Merge with the pairs collected from the first source (joins on the
# columns the two tables have in common).
tab <- full_join(tab, first_tab)
# both_tab =
####################################
# Another set of types
####################################
# Source 3: a plain-text Mime.types file. Each kept line is split on tabs
# into a MIME type (first field) followed by space-separated extensions.
url <- "https://raw.githubusercontent.com/hoaproject/Mime/master/Mime.types"
doc <- readLines(url)

# Drop comment lines and empty lines, then collapse runs of tabs so every
# line splits into clean fields.
doc <- doc[!grepl("^#", doc)]
doc <- doc[!(doc %in% "")]
doc <- gsub("\t+", "\t", doc)

# One small tibble per line: one row per extension, or a single row with a
# missing extension when the line lists none.
df <- strsplit(doc, "\t")
df <- lapply(df, function(x) {
  ext <- NA_character_
  if (length(x) > 1) {
    ext <- unlist(strsplit(x[-1], split = " "))
  }
  tibble(
    ext = ext,
    mime_type = x[1]
  )
})
df <- bind_rows(df)

df <- df %>%
  filter(
    !is.na(ext),
    # repeated spaces between extensions produce empty tokens
    !ext %in% "",
    !is.na(mime_type)
  ) %>%
  # Give every extension a leading dot, as is done for the mime package
  # extensions with paste0(".", ...), so the same extension from different
  # sources ends up as the same value. Already-dotted values are unchanged.
  # NOTE(review): this file appears to list extensions without the dot
  # (mime.types convention) -- confirm against the downloaded file.
  mutate(ext = paste0(ifelse(startsWith(ext, "."), "", "."), ext))

# Merge with the pairs collected so far (joins on the common columns).
tab <- full_join(tab, df)
# Source 4: the extension -> MIME type map shipped with the mime package.
# `mime::mimemap` is used as a named vector: its values are the MIME types
# and its names are the extensions, which get a leading dot added here.
# tibble() replaces the deprecated data_frame(); unname() keeps the
# extension names from being carried along inside the mime_type column.
mime_df <- tibble(
  mime_type = unname(mime::mimemap),
  ext = paste0(".", names(mime::mimemap))
)

# Final table: every (mime_type, ext) pair from all sources (joins on the
# common columns).
mime_extensions <- full_join(tab, mime_df)

# Spot check: all extensions recorded for text/plain. %in% never yields NA,
# so a missing mime_type cannot add an all-NA row to the output.
mime_extensions[mime_extensions$mime_type %in% "text/plain", ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment