muschellij2 · October 22, 2018 14:30
diff --git a/mime_type_extensions.R b/mime_type_extensions.R
 library(rvest)
 library(dplyr)
 library(tidyr)


 # Source 1: the freeformatter.com MIME type reference page.
 url <- "https://www.freeformatter.com/mime-types-list.html"
 doc <- read_html(url)

 # The page is expected to hold exactly one HTML table; stop otherwise.
 tab <- html_table(doc)
 stopifnot(length(tab) == 1)
 tab <- tab[[1]]

 # Switch to short, syntactic column names.
 tab <- rename(
   tab,
   name = Name,
   mime_type = `MIME Type / Internet Media Type`,
   ext = `File Extension`,
   details = `More Details`
 )

 # Drop a trailing comma from the MIME type and turn the literal "N/A"
 # extension into a real missing value.
 tab <- mutate(
   tab,
   mime_type = sub(",$", "", mime_type),
   ext = ifelse(ext %in% "N/A", NA, ext)
 )

 # Every row must carry a MIME type.
 n_missing <- sum(is.na(tab$mime_type))
 stopifnot(n_missing == 0)

 #################################
 # Rows listing several extensions
 #################################
 # Produce one row per (MIME type, extension) pair. The extension cell is
 # split on a comma followed by optional whitespace, and the same pattern is
 # used for counting and for splitting, so ".a, .b" and ".a,.b" are treated
 # alike and a cell may list any number of extensions.
 ext_sep <- ",[[:space:]]*"
 n_ext <- lengths(strsplit(tab$ext, split = ext_sep))
 # n_ext stays on the table: it records how many extensions the source row
 # listed and is used by the spot check below.
 tab$n_ext <- n_ext
 tab <- tab %>%
   separate_rows(ext, sep = ext_sep) %>%
   arrange(mime_type)

 # Spot checks: an "atom" entry and every row that listed several extensions.
 tab[grepl("atom", tab$mime_type), ]
 tab[tab$n_ext > 1, ]


 #################################
 # Rows listing several MIME types
 #################################
 # Produce one row per MIME type. Splitting on a comma plus optional
 # whitespace keeps stray leading blanks out of the values and places no
 # limit on how many types a single cell may list.
 mime_sep <- ",[[:space:]]*"
 n_mime <- lengths(strsplit(tab$mime_type, split = mime_sep))
 tab <- tab %>%
   separate_rows(mime_type, sep = mime_sep) %>%
   arrange(mime_type)

 # Spot checks: the "java" entries, and any row left without a MIME type.
 tab[grepl("java", tab$mime_type), ]

 tab[is.na(tab$mime_type), ]

 # Keep only complete (MIME type, extension) pairs and drop every other
 # column; this is the first table fed into the joins further down.
 tab <- tab %>%
   filter(
     !is.na(mime_type),
     !is.na(ext),
     !ext %in% ""
   ) %>%
   select(mime_type, ext)
 first_tab <- tab

 # Source 2: the sitepoint.com MIME type list.
 url <- "https://www.sitepoint.com/mime-types-complete-list/"
 doc <- read_html(url)

 # Exactly one HTML table is expected on the page.
 tab <- html_table(doc)
 stopifnot(length(tab) == 1)
 tab <- tab[[1]]
 tab <- rename(
   tab,
   ext = `Suffixes applicable`,
   mime_type = `Media type and subtype(s)`
 )

 # Exactly two MIME type cells are expected to contain a space; the check
 # fails loudly if the page content changes.
 have_spaces <- grepl(" ", tab$mime_type)
 stopifnot(sum(have_spaces) == 2)

 # Keep only the text before the first space of each MIME type and turn the
 # literal "N/A" extension into a real missing value.
 tab <- mutate(
   tab,
   mime_type = sub(" .*", "", mime_type),
   ext = ifelse(ext %in% "N/A", NA, ext)
 )
 n_missing <- sum(is.na(tab$mime_type))
 stopifnot(n_missing == 0)

 # Merge with the first table. No `by` is given, so the join uses every
 # column name the two tables share.
 tab <- full_join(tab, first_tab)
 # both_tab = 

 ####################################
 # Source 3: a plain-text Mime.types file
 ####################################
 url <- "https://raw.githubusercontent.com/hoaproject/Mime/master/Mime.types"
 doc <- readLines(url)

 # Discard blank lines and comment lines (those starting with "#").
 doc <- trimws(doc)
 doc <- doc[nzchar(doc) & !startsWith(doc, "#")]

 # Each remaining line is a MIME type followed by zero or more extensions.
 # Fields are split on any run of whitespace, so tab- and space-separated
 # lines are both understood. A line with no extension yields zero rows.
 df <- lapply(strsplit(doc, split = "[[:space:]]+"), function(fields) {
   tibble(
     ext = fields[-1],
     mime_type = rep(fields[1], length(fields) - 1L)
   )
 })
 df <- bind_rows(df) %>%
   filter(
     !is.na(ext),
     !is.na(mime_type)
   ) %>%
   # The rest of this script writes extensions with a leading dot (the
   # mime::mimemap names get "." prepended before they are joined). Add the
   # dot here when it is absent so identical pairs match in the join.
   # NOTE(review): presumably this file lists bare extensions ("ez", not
   # ".ez"); the substitution is a no-op if the dot is already there.
   mutate(ext = sub("^\\.?", ".", ext))
 tab <- full_join(tab, df, by = c("ext", "mime_type"))

 # Source 4: the map shipped with the mime package. It is used here as a
 # vector of MIME types whose names are extensions without a leading dot,
 # so the dot is prepended. unname() keeps the names attribute out of the
 # column.
 mime_df <- tibble(
   mime_type = unname(mime::mimemap),
   ext = paste0(".", names(mime::mimemap))
 )

 # Final table: every (extension, MIME type) pair seen in any source.
 mime_extensions <- full_join(tab, mime_df, by = c("ext", "mime_type"))
 # Spot check; %in% never yields NA, so no all-NA rows can be selected.
 mime_extensions[mime_extensions$mime_type %in% "text/plain", ]
	library(rvest)
	library(dplyr)
	library(tidyr)


	# Source 1: the freeformatter.com MIME type reference page.
	url <- "https://www.freeformatter.com/mime-types-list.html"
	doc <- read_html(url)

	# The page is expected to hold exactly one HTML table; stop otherwise.
	tab <- html_table(doc)
	stopifnot(length(tab) == 1)
	tab <- tab[[1]]

	# Switch to short, syntactic column names.
	tab <- rename(
	  tab,
	  name = Name,
	  mime_type = `MIME Type / Internet Media Type`,
	  ext = `File Extension`,
	  details = `More Details`
	)

	# Drop a trailing comma from the MIME type and turn the literal "N/A"
	# extension into a real missing value.
	tab <- mutate(
	  tab,
	  mime_type = sub(",$", "", mime_type),
	  ext = ifelse(ext %in% "N/A", NA, ext)
	)

	# Every row must carry a MIME type.
	n_missing <- sum(is.na(tab$mime_type))
	stopifnot(n_missing == 0)

	#################################
	# Rows listing several extensions
	#################################
	# Produce one row per (MIME type, extension) pair. The extension cell is
	# split on a comma followed by optional whitespace, and the same pattern is
	# used for counting and for splitting, so ".a, .b" and ".a,.b" are treated
	# alike and a cell may list any number of extensions.
	ext_sep <- ",[[:space:]]*"
	n_ext <- lengths(strsplit(tab$ext, split = ext_sep))
	# n_ext stays on the table: it records how many extensions the source row
	# listed and is used by the spot check below.
	tab$n_ext <- n_ext
	tab <- tab %>%
	  separate_rows(ext, sep = ext_sep) %>%
	  arrange(mime_type)

	# Spot checks: an "atom" entry and every row that listed several extensions.
	tab[grepl("atom", tab$mime_type), ]
	tab[tab$n_ext > 1, ]


	#################################
	# Rows listing several MIME types
	#################################
	# Produce one row per MIME type. Splitting on a comma plus optional
	# whitespace keeps stray leading blanks out of the values and places no
	# limit on how many types a single cell may list.
	mime_sep <- ",[[:space:]]*"
	n_mime <- lengths(strsplit(tab$mime_type, split = mime_sep))
	tab <- tab %>%
	  separate_rows(mime_type, sep = mime_sep) %>%
	  arrange(mime_type)

	# Spot checks: the "java" entries, and any row left without a MIME type.
	tab[grepl("java", tab$mime_type), ]

	tab[is.na(tab$mime_type), ]

	# Keep only complete (MIME type, extension) pairs and drop every other
	# column; this is the first table fed into the joins further down.
	tab <- tab %>%
	  filter(
	    !is.na(mime_type),
	    !is.na(ext),
	    !ext %in% ""
	  ) %>%
	  select(mime_type, ext)
	first_tab <- tab

	# Source 2: the sitepoint.com MIME type list.
	url <- "https://www.sitepoint.com/mime-types-complete-list/"
	doc <- read_html(url)

	# Exactly one HTML table is expected on the page.
	tab <- html_table(doc)
	stopifnot(length(tab) == 1)
	tab <- tab[[1]]
	tab <- rename(
	  tab,
	  ext = `Suffixes applicable`,
	  mime_type = `Media type and subtype(s)`
	)

	# Exactly two MIME type cells are expected to contain a space; the check
	# fails loudly if the page content changes.
	have_spaces <- grepl(" ", tab$mime_type)
	stopifnot(sum(have_spaces) == 2)

	# Keep only the text before the first space of each MIME type and turn the
	# literal "N/A" extension into a real missing value.
	tab <- mutate(
	  tab,
	  mime_type = sub(" .*", "", mime_type),
	  ext = ifelse(ext %in% "N/A", NA, ext)
	)
	n_missing <- sum(is.na(tab$mime_type))
	stopifnot(n_missing == 0)

	# Merge with the first table. No `by` is given, so the join uses every
	# column name the two tables share.
	tab <- full_join(tab, first_tab)
	# both_tab =

	####################################
	# Source 3: a plain-text Mime.types file
	####################################
	url <- "https://raw.githubusercontent.com/hoaproject/Mime/master/Mime.types"
	doc <- readLines(url)

	# Discard blank lines and comment lines (those starting with "#").
	doc <- trimws(doc)
	doc <- doc[nzchar(doc) & !startsWith(doc, "#")]

	# Each remaining line is a MIME type followed by zero or more extensions.
	# Fields are split on any run of whitespace, so tab- and space-separated
	# lines are both understood. A line with no extension yields zero rows.
	df <- lapply(strsplit(doc, split = "[[:space:]]+"), function(fields) {
	  tibble(
	    ext = fields[-1],
	    mime_type = rep(fields[1], length(fields) - 1L)
	  )
	})
	df <- bind_rows(df) %>%
	  filter(
	    !is.na(ext),
	    !is.na(mime_type)
	  ) %>%
	  # The rest of this script writes extensions with a leading dot (the
	  # mime::mimemap names get "." prepended before they are joined). Add the
	  # dot here when it is absent so identical pairs match in the join.
	  # NOTE(review): presumably this file lists bare extensions ("ez", not
	  # ".ez"); the substitution is a no-op if the dot is already there.
	  mutate(ext = sub("^\\.?", ".", ext))
	tab <- full_join(tab, df, by = c("ext", "mime_type"))

	# Source 4: the map shipped with the mime package. It is used here as a
	# vector of MIME types whose names are extensions without a leading dot,
	# so the dot is prepended. unname() keeps the names attribute out of the
	# column.
	mime_df <- tibble(
	  mime_type = unname(mime::mimemap),
	  ext = paste0(".", names(mime::mimemap))
	)

	# Final table: every (extension, MIME type) pair seen in any source.
	mime_extensions <- full_join(tab, mime_df, by = c("ext", "mime_type"))
	# Spot check; %in% never yields NA, so no all-NA rows can be selected.
	mime_extensions[mime_extensions$mime_type %in% "text/plain", ]