Skip to content

Instantly share code, notes, and snippets.

@comstock
Last active September 6, 2019 20:24
Show Gist options
  • Select an option

  • Save comstock/10abbc59b4c677dc10693a41c4527e44 to your computer and use it in GitHub Desktop.

Select an option

Save comstock/10abbc59b4c677dc10693a41c4527e44 to your computer and use it in GitHub Desktop.
Generate an inventory of file types found in a filesystem
pacman::p_load(
readr,
httr,
jsonlite,
curl,
stringr,
dplyr,
xtable,
formattable,
htmlTable,
data.table,
RCurl,
plotly,
scales,
SparkR
) # load modules
## Working directory: list.files() below scans this tree, and mimeTypes.Rda is
## written here (so the cache file itself shows up in the inventory).
## NOTE(review): hard-coded, user-specific path -- adjust before running on
## another machine.
setwd("~/images/24bit")
######################
## mime types table ##
## Editable lookup table (a published Google Sheet, exported as CSV) used to
## match file extensions with likely format types.
## read.csv(text = ) opens and closes its own text connection, whereas
## read.csv(textConnection(...)) leaves the connection open until it is
## garbage-collected.
mimeTypes <- read.csv(
  text = RCurl::getURI(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vQjkkqveEW5eiuO2D_rLIiEs1Whj0Y98m8aIxaKP4qetCebpdYkuW5_4awHcjI0V6DtE3vJ5-ftZJTE/pub?gid=789984673&single=true&output=csv"
  )
)
## Cache the table exactly as downloaded (before the Extension clean-up below).
save(mimeTypes, file = "mimeTypes.Rda")
## Normalise the join key: trim any unseen whitespace, then lower-case.
mimeTypes$Extension <- tolower(trimws(mimeTypes$Extension))
## mime types table ##
######################
#########################
## targeted files list ##
## Recursively collect every file under the working directory whose name ends
## in a dot followed by 1 to 5 word characters (e.g. ".tif", ".jpeg").
## Hidden files are included, directories are not, and the returned paths are
## relative to the working directory.
fileList <- list.files(
  path = ".",
  pattern = ".*\\.\\w{1,5}$",
  all.files = TRUE,
  full.names = FALSE,
  recursive = TRUE,
  ignore.case = TRUE,
  include.dirs = FALSE
)
## targeted files list ##
#########################
########################
## ignored files list ##
## Complement of the targeted list: every file under the working directory
## whose path does NOT end in a dot followed by 1 to 5 word characters.
## The result is a one-column data frame. The column keeps the name
## "Extension" for compatibility, although it holds relative file paths.
## Written in base R on purpose: SparkR is attached last by p_load() and masks
## dplyr::filter(), so a bare filter() call here would not reach dplyr.
ignored.fileList <- local({
  all_paths <- list.files(
    recursive = TRUE,
    all.files = TRUE,
    full.names = FALSE,
    include.dirs = FALSE
  ) # every file, no name filter
  has_suffix <- grepl(".*\\.\\w{1,5}$", all_paths) # same test as the targeted list
  data.frame(Extension = all_paths[!has_suffix])
})
## ignored files list ##
########################
## Number of targeted files (auto-printed when run at top level).
len.fileList <- length(fileList)
len.fileList
## Keep only the trailing ".ext" (dot included) of each path, trimmed of any
## unseen whitespace and lower-cased so it compares equal to the cleaned
## mimeTypes$Extension key.
extList <- data.frame(
  Extension = tolower(trimws(sub(".*(\\.\\w{1,5}$)", "\\1", fileList))),
  stringsAsFactors = FALSE
)
## nrow(), not length(): length() of a data frame is its number of columns
## (always 1 here), not the number of extensions collected.
len.extList <- nrow(extList)
len.extList
## Left join of the per-file extensions onto the mime types table: every row
## of extList is kept, and extensions with no match in mimeTypes get NA in the
## mimeTypes columns. NA keys are never matched (incomparables = NA).
## Matching is exact and case-sensitive; merge() has no ignore.case argument
## (it was silently swallowed by `...`), which is why both key columns are
## lower-cased before this point.
## NOTE(review): if mimeTypes lists an extension more than once, each matching
## file row is duplicated here -- confirm the sheet has unique extensions.
files.found <-
  merge(
    x = extList,
    y = mimeTypes,
    by = "Extension",
    all.x = TRUE,
    all.y = FALSE,
    incomparables = NA
  )
## Tally the files per extension. dplyr verbs are namespace-qualified because
## SparkR, attached last by p_load(), masks count(), arrange() and desc().
tbl.ext.counts <- dplyr::count(files.found, Extension)
tbl.ext.counts # show the raw tally (columns: Extension, n)
names(tbl.ext.counts) <- c("Extension", "Count")
## Re-attach the mime type details to each counted extension (left join;
## extensions missing from mimeTypes keep NA in the mimeTypes columns).
## merge() has no ignore.case argument, so matching is exact.
tbl.ext.counts <- merge(
  x = tbl.ext.counts,
  y = mimeTypes,
  by = "Extension",
  all.x = TRUE,
  all.y = FALSE,
  incomparables = NA
)
## Most frequent extensions first.
tbl.ext.counts <- dplyr::arrange(tbl.ext.counts, dplyr::desc(Count))
###################
## mystery files ##
## Extensions found on disk that have no Name after the join with mimeTypes,
## counted and sorted with the most frequent first.
## NOTE(review): assumes the mime types sheet provides a "Name" column --
## confirm against the published table.
## dplyr verbs are namespace-qualified because SparkR, attached last by
## p_load(), masks filter(), count(), arrange() and desc().
blanks <- dplyr::filter(files.found, is.na(Name))
blanks <- dplyr::count(blanks, Extension)
blanks # show the raw tally (columns: Extension, n)
names(blanks) <- c("Extension", "Count")
blanks <- dplyr::arrange(blanks, dplyr::desc(Count))
## mystery files ##
###################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment