comstock · September 6, 2019 20:24
diff --git a/file_type_inventory.R b/file_type_inventory.R
 pacman::p_load(
  readr,
  httr,
  jsonlite,
  curl,
  stringr,
  dplyr,
  xtable,
  formattable,
  htmlTable,
  data.table,
  RCurl,
  plotly,
  scales,
  SparkR
 ) # load modules

 # Every relative path below (the recursive directory scans and the saved
 # .Rda file) resolves against this working directory.
 setwd("~/images/24bit")

 ######################
 ## mime types table ##
 ## Editable lookup table used to match file extensions with likely format
 ## types. It is fetched as CSV text from a published Google Sheets URL.

 mimeTypes <-
   getURI(
     "https://docs.google.com/spreadsheets/d/e/2PACX-1vQjkkqveEW5eiuO2D_rLIiEs1Whj0Y98m8aIxaKP4qetCebpdYkuW5_4awHcjI0V6DtE3vJ5-ftZJTE/pub?gid=789984673&single=true&output=csv"
   )
 # read.csv(text = ...) opens and closes its own text connection, so no
 # connection is left dangling after the parse.
 mimeTypes <- read.csv(text = mimeTypes)

 # Cache the table exactly as downloaded, i.e. before the Extension column is
 # normalised below.
 save(mimeTypes, file = "mimeTypes.Rda")

 # Normalise the join key: strip unseen whitespace, then lower-case.
 mimeTypes$Extension <- tolower(trimws(mimeTypes$Extension))
 ## mime types table ##
 ######################

 #########################
 ## targeted files list ##
 ## Recursively collect every file (hidden ones included) below the working
 ## directory whose name ends in a dot followed by 1 to 5 word characters.
 ## Paths are relative to the working directory; directories themselves are
 ## not listed.
 fileList <- list.files(
   pattern = ".*\\.\\w{1,5}$",
   all.files = TRUE,
   full.names = FALSE,
   recursive = TRUE,
   ignore.case = TRUE,
   include.dirs = FALSE
 )
 ## targeted files list ##
 #########################

 ########################
 ## ignored files list ##
 ## Files the targeted list skips: the same recursive scan without a suffix
 ## restriction, minus every entry that ends in a dot plus 1 to 5 word
 ## characters.
 ignored.fileList <- as.data.frame(list.files(
   pattern = ".*",
   all.files = TRUE,
   full.names = FALSE,
   recursive = TRUE,
   ignore.case = TRUE,
   include.dirs = FALSE
 )) # every file, whatever its name looks like

 # The single column holds relative file paths; the column name "Extension"
 # is kept unchanged.
 colnames(ignored.fileList) <- "Extension"

 # dplyr:: is spelled out because SparkR is attached after dplyr and exports
 # its own filter() (written for SparkDataFrames), which would otherwise be
 # the one found first on the search path.
 ignored.fileList <- ignored.fileList %>%
   dplyr::filter(!grepl(".*\\.\\w{1,5}$", Extension)) # drop 'targeted' entries
 ## ignored files list ##
 ########################

 # Number of targeted files; the bare name echoes the value at top level.
 len.fileList <- length(fileList)
 len.fileList

 # Keep only the trailing ".ext" part of every targeted path.
 extList <- as.data.frame(gsub(".*(\\.\\w{1,5}$)", "\\1", fileList))

 # nrow(), not length(): length() of a data.frame counts its columns and
 # would always report 1 here instead of the number of extensions.
 len.extList <- nrow(extList)
 len.extList

 colnames(extList) <- "Extension"

 # Normalise the join key the same way as the lookup table: strip unseen
 # whitespace, then lower-case.
 extList$Extension <- tolower(trimws(extList$Extension))

 # Left join: one row per targeted file, annotated with the mimeTypes columns
 # for its extension; extensions missing from mimeTypes keep NA in those
 # columns. NA keys are never matched (incomparables = NA).
 # The key is named once via `by` so both sides join on Extension only.
 # merge() has no `ignore.case` argument (it was swallowed by `...`); the
 # join is case-insensitive because both Extension columns are lower-cased
 # before this point.
 files.found <-
   merge(
     x = extList,
     y = mimeTypes,
     by = "Extension",
     all.x = TRUE,
     all.y = FALSE,
     incomparables = NA
   )

 # Tally the targeted files per extension and echo the raw tally.
 # dplyr:: is spelled out because SparkR is attached after dplyr and exports
 # its own count(), arrange() and desc().
 tbl.ext.counts <- dplyr::count(files.found, Extension)
 tbl.ext.counts

 colnames(tbl.ext.counts) <- c("Extension", "Count")

 # Re-attach the mimeTypes columns to each extension's count (left join on
 # Extension; merge() has no `ignore.case` argument, so none is passed).
 tbl.ext.counts <- merge(
   x = tbl.ext.counts,
   y = mimeTypes,
   by = "Extension",
   all.x = TRUE,
   all.y = FALSE,
   incomparables = NA
 )

 # Most frequent extensions first.
 tbl.ext.counts <- tbl.ext.counts %>%
   dplyr::arrange(dplyr::desc(Count))

 ###################
 ## mystery files ##
 ## Rows of files.found whose Name is NA, counted per extension.
 ## NOTE(review): Name is presumably a mimeTypes column, so NA would mean
 ## "extension not in the lookup table" -- confirm against the sheet.
 # dplyr:: is spelled out because SparkR is attached after dplyr and exports
 # its own filter(), count(), arrange() and desc().
 blanks <- files.found %>%
   dplyr::filter(is.na(Name))

 # Echo the raw tally, then give the count column its final name.
 blanks <- dplyr::count(blanks, Extension)
 blanks
 colnames(blanks) <- c("Extension", "Count")

 # Most frequent unknown extensions first.
 blanks <- blanks %>%
   dplyr::arrange(dplyr::desc(Count))
 ## mystery files ##
 ###################
	pacman::p_load(
	readr,
	httr,
	jsonlite,
	curl,
	stringr,
	dplyr,
	xtable,
	formattable,
	htmlTable,
	data.table,
	RCurl,
	plotly,
	scales,
	SparkR
	) # load modules

	# Every relative path below (the recursive directory scans and the saved
	# .Rda file) resolves against this working directory.
	setwd("~/images/24bit")

	######################
	## mime types table ##
	## Editable lookup table used to match file extensions with likely format
	## types. It is fetched as CSV text from a published Google Sheets URL.

	mimeTypes <-
	  getURI(
	    "https://docs.google.com/spreadsheets/d/e/2PACX-1vQjkkqveEW5eiuO2D_rLIiEs1Whj0Y98m8aIxaKP4qetCebpdYkuW5_4awHcjI0V6DtE3vJ5-ftZJTE/pub?gid=789984673&single=true&output=csv"
	  )
	# read.csv(text = ...) opens and closes its own text connection, so no
	# connection is left dangling after the parse.
	mimeTypes <- read.csv(text = mimeTypes)

	# Cache the table exactly as downloaded, i.e. before the Extension column is
	# normalised below.
	save(mimeTypes, file = "mimeTypes.Rda")

	# Normalise the join key: strip unseen whitespace, then lower-case.
	mimeTypes$Extension <- tolower(trimws(mimeTypes$Extension))
	## mime types table ##
	######################

	#########################
	## targeted files list ##
	## Recursively collect every file (hidden ones included) below the working
	## directory whose name ends in a dot followed by 1 to 5 word characters.
	## Paths are relative to the working directory; directories themselves are
	## not listed.
	fileList <- list.files(
	  pattern = ".*\\.\\w{1,5}$",
	  all.files = TRUE,
	  full.names = FALSE,
	  recursive = TRUE,
	  ignore.case = TRUE,
	  include.dirs = FALSE
	)
	## targeted files list ##
	#########################

	########################
	## ignored files list ##
	## Files the targeted list skips: the same recursive scan without a suffix
	## restriction, minus every entry that ends in a dot plus 1 to 5 word
	## characters.
	ignored.fileList <- as.data.frame(list.files(
	  pattern = ".*",
	  all.files = TRUE,
	  full.names = FALSE,
	  recursive = TRUE,
	  ignore.case = TRUE,
	  include.dirs = FALSE
	)) # every file, whatever its name looks like

	# The single column holds relative file paths; the column name "Extension"
	# is kept unchanged.
	colnames(ignored.fileList) <- "Extension"

	# dplyr:: is spelled out because SparkR is attached after dplyr and exports
	# its own filter() (written for SparkDataFrames), which would otherwise be
	# the one found first on the search path.
	ignored.fileList <- ignored.fileList %>%
	  dplyr::filter(!grepl(".*\\.\\w{1,5}$", Extension)) # drop 'targeted' entries
	## ignored files list ##
	########################

	# Number of targeted files; the bare name echoes the value at top level.
	len.fileList <- length(fileList)
	len.fileList

	# Keep only the trailing ".ext" part of every targeted path.
	extList <- as.data.frame(gsub(".*(\\.\\w{1,5}$)", "\\1", fileList))

	# nrow(), not length(): length() of a data.frame counts its columns and
	# would always report 1 here instead of the number of extensions.
	len.extList <- nrow(extList)
	len.extList

	colnames(extList) <- "Extension"

	# Normalise the join key the same way as the lookup table: strip unseen
	# whitespace, then lower-case.
	extList$Extension <- tolower(trimws(extList$Extension))

	# Left join: one row per targeted file, annotated with the mimeTypes columns
	# for its extension; extensions missing from mimeTypes keep NA in those
	# columns. NA keys are never matched (incomparables = NA).
	# The key is named once via `by` so both sides join on Extension only.
	# merge() has no `ignore.case` argument (it was swallowed by `...`); the
	# join is case-insensitive because both Extension columns are lower-cased
	# before this point.
	files.found <-
	  merge(
	    x = extList,
	    y = mimeTypes,
	    by = "Extension",
	    all.x = TRUE,
	    all.y = FALSE,
	    incomparables = NA
	  )

	# Tally the targeted files per extension and echo the raw tally.
	# dplyr:: is spelled out because SparkR is attached after dplyr and exports
	# its own count(), arrange() and desc().
	tbl.ext.counts <- dplyr::count(files.found, Extension)
	tbl.ext.counts

	colnames(tbl.ext.counts) <- c("Extension", "Count")

	# Re-attach the mimeTypes columns to each extension's count (left join on
	# Extension; merge() has no `ignore.case` argument, so none is passed).
	tbl.ext.counts <- merge(
	  x = tbl.ext.counts,
	  y = mimeTypes,
	  by = "Extension",
	  all.x = TRUE,
	  all.y = FALSE,
	  incomparables = NA
	)

	# Most frequent extensions first.
	tbl.ext.counts <- tbl.ext.counts %>%
	  dplyr::arrange(dplyr::desc(Count))

	###################
	## mystery files ##
	## Rows of files.found whose Name is NA, counted per extension.
	## NOTE(review): Name is presumably a mimeTypes column, so NA would mean
	## "extension not in the lookup table" -- confirm against the sheet.
	# dplyr:: is spelled out because SparkR is attached after dplyr and exports
	# its own filter(), count(), arrange() and desc().
	blanks <- files.found %>%
	  dplyr::filter(is.na(Name))

	# Echo the raw tally, then give the count column its final name.
	blanks <- dplyr::count(blanks, Extension)
	blanks
	colnames(blanks) <- c("Extension", "Count")

	# Most frequent unknown extensions first.
	blanks <- blanks %>%
	  dplyr::arrange(dplyr::desc(Count))
	## mystery files ##
	###################
No results found