Last active
February 16, 2023 18:50
-
-
Save JamoCA/fee34a03bbe61a2f8e40 to your computer and use it in GitHub Desktop.
ColdFusion UDF to sanitize filename & remove illegal characters & symbols that are incompatible/invalid when used with different languages, OS and devices.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<cfscript>
/* SanitizeFilename - Sanitize a string to be safe for use as a filename by removing directory paths and invalid characters.
	Usage: localSanitizeFileName(s, rules, replacementCharacter, maxLength, unsafeCharRegex);
	rules (comma-delimited list): convertspacestounderscores, convertspacestodashes, convertdashtounderscore, convertunderscoretodash,
		nodash, nounderscore, forcelowercase, forceuppercase
	replacementCharacter: default = ""
	maxLength: default = 255
	unsafeCharRegex: default = "[^A-Za-z0-9\-_]"
	NOTE: Lowercases extension
	Written by SunStar Media https://www.sunstarmedia.com/
	3/18/2015 Initial UDF
	4/25/2018 Updated w/optional ICU4J support (to convert foreign characters to Latin-ASCII)
		https://gist.github.com/JamoCA/ec4617b066fc4bb601f620bc93bacb57
		Tested to pass NaughtyStrings test https://github.com/minimaxir/big-list-of-naughty-strings
	3/11/2021 Modernized, added maxLength, default unsafeCharRegex and fixed some bugs on edge cases;
		returns an empty string if the filename is irreparably unsafe.
*/
/*
	localSanitizeFileName - reduce an arbitrary string to a filename that is safe to store.

	Parameters:
		s                    - candidate filename; may include a directory path and/or markup.
		rules                - comma-delimited, case-insensitive list of optional rules:
		                         convertSpacesToUnderscores | convertSpacesToDashes
		                         convertDashToUnderscore | convertUnderscoreToDash
		                         noDash, noUnderscore (only honoured when neither dash/underscore conversion rule is set)
		                         forceLowercase | forceUppercase
		replacementCharacter - literal text substituted for every unsafe character (default: remove).
		maxLength            - maximum length of the result including ".ext"; 0 or less disables the limit.
		unsafeCharRegex      - Java regex matching characters to strip from the name part; "" disables stripping.

	Returns the sanitized filename with a lower-cased extension, or "" when nothing usable remains.

	Processing order: strip path and tags, split off the extension, drop Windows reserved
	device names, convert spaces, strip unsafe characters, apply dash/underscore/case rules,
	truncate to maxLength, re-check reserved names, re-attach the extension.

	NOTE(review): when the name part is emptied but an extension survives (e.g. "con.txt"),
	the result is ".txt". That matches the historical behaviour; confirm it is what callers want.
*/
public string function localSanitizeFileName(
    string s="",
    string rules="",
    string replacementCharacter="",
    numeric maxLength=255,
    string unsafeCharRegex="[^A-Za-z0-9\-_]"
) output=false hint="Sanitize a string to be safe for use as a filename by removing directory paths and invalid characters." {
    local.windowsReservedRe = "^(con|prn|aux|nul|com[0-9]|lpt[0-9])(\.[^.]*)?$";

    /* String.replaceAll() interprets "$" and "\" in the replacement text; quote it so the
       caller-supplied replacement is always inserted literally instead of throwing. */
    local.replacement = createObject("java", "java.util.regex.Matcher").quoteReplacement(javacast("string", arguments.replacementCharacter));

    /* Keep only the last path segment (both separator styles), then drop anything that looks like a tag. */
    local.response = trim(javacast("string", arguments.s));
    local.response = listLast(listLast(local.response, "/"), "\").replaceAll("(?ms)\<[^>]*\>", "");

    /* A leading dot with a single dot-delimited element (e.g. ".htaccess") is rejected outright. */
    if (left(local.response, 1) is "." and listLen(local.response, ".") is 1){
        return "";
    }

    /* Transliterate ICU4J CFC (optional; converts foreign characters to latin equivalents
    if (NOT structkeyExists(server, "Transliterator")){
        server.transliterator = createObject("component","transliterator");
    }
    local.response = server.Transliterator.transliterate('Latin-ASCII; [:Nonspacing Mark:] Remove; NFC;', local.response);
    */
    /* JUnidecode https://github.com/gcardone/junidecode
    if (not structkeyExists(server, "jUnidecodeLib")){
        server.jUnidecodeLib = createObject("java", "net.gcardone.junidecode.Junidecode");
    }
    local.response = server.jUnidecodeLib.unidecode( local.response );
    local.response = trim(replaceNoCase(local.response, "[?]", "", "all"));
    */

    /* The extension is whatever follows the last period; everything before it is the name part. */
    local.ext = "";
    if (listLen(local.response, ".") gt 1){
        local.ext = trim(listLast(local.response, "."));
    }

    /* Remove a reserved device name (optionally followed by a single extension), e.g. "con" or "nul.txt". */
    local.response = trim(reReplaceNoCase(local.response, local.windowsReservedRe, "", "one"));
    if (listLen(local.response, ".") gt 1 and (len(local.response) - len(local.ext) - 1) gt 0){
        local.response = left(local.response, len(local.response) - len(local.ext) - 1);
    }

    /* Always sanitize the extension, including when the name part was emptied above;
       otherwise unsafe characters in the extension would reach the result. */
    if (len(local.ext)){
        local.ext = trim(reReplaceNoCase(local.ext, local.windowsReservedRe, "", "one"));
        local.ext = reReplace(local.ext, "[^A-Za-z0-9]", "", "all");
    }

    /* Space conversion must run BEFORE the unsafe-character strip: the default regex treats
       spaces as unsafe, so converting afterwards would never find a space to convert. */
    if (listFindNoCase(arguments.rules, "convertSpacesToUnderscores")){
        local.response = local.response.replaceAll(" ", "_");
    } else if (listFindNoCase(arguments.rules, "convertSpacesToDashes")){
        local.response = local.response.replaceAll(" ", "-");
    }

    if (len(arguments.unsafeCharRegex)){
        local.response = local.response.replaceAll(arguments.unsafeCharRegex, local.replacement);
    }

    if (listFindNoCase(arguments.rules, "convertDashToUnderscore")){
        local.response = local.response.replaceAll("[\-]", "_");
    } else if (listFindNoCase(arguments.rules, "convertUnderscoreToDash")){
        local.response = local.response.replaceAll("[\_]", "-");
    } else {
        if (listFindNoCase(arguments.rules, "noDash")){
            local.response = local.response.replaceAll("[\-]", local.replacement);
        }
        if (listFindNoCase(arguments.rules, "noUnderscore")){
            local.response = local.response.replaceAll("[\_]", local.replacement);
        }
    }

    if (listFindNoCase(arguments.rules, "forceLowercase")){
        local.response = lCase(local.response);
    } else if (listFindNoCase(arguments.rules, "forceUppercase")){
        local.response = uCase(local.response);
    }

    /* extLength counts the "." separator as well as the extension characters. */
    local.extLength = len(local.ext);
    if (local.extLength){
        local.extLength += 1;
    }

    if (arguments.maxLength gt 0 and (len(local.response) + local.extLength) gt arguments.maxLength){
        local.keep = arguments.maxLength - local.extLength;
        if (local.keep gt 0){
            local.response = left(local.response, local.keep);
        } else {
            /* The extension alone fills (or exceeds) the limit. left() must not be called with a
               non-positive count, so drop the name part and shorten the extension to fit. */
            local.response = "";
            if (arguments.maxLength gt 1){
                local.ext = left(local.ext, arguments.maxLength - 1);
            } else {
                local.ext = "";
            }
        }
    }

    /* Stripping or truncating can itself produce a reserved device name ("co$n" -> "con"),
       so check the finished name part once more. */
    if (reFindNoCase("^(con|prn|aux|nul|com[0-9]|lpt[0-9])$", local.response)){
        local.response = "";
    }

    if (len(local.ext)){
        local.response = local.response & "." & lCase(local.ext);
    }
    return local.response;
}
</cfscript>
I tend to avoid it, but filenames with multiple periods exist in abundance. They are very commonly used with minified resource files (`jquery.min.js`) and with subtitle SRT files to denote the language (`medianame.en.srt`).
Upon further review, I noticed some other patterns where commas and `@` symbols are necessary, so I extended the UDF to accept a custom `unsafeCharRegex` regex string that can be used to keep desired characters from being sanitized. I just updated the UDF and unit tested it. Enjoy!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NOTE: Someone commented to suggest an edit (and stated they've never seen more than one period in a filename) and then removed their comments.
If a filename has multiple periods, like `This.is.a.valid.imagename.jpg`, your solution will return only `This.jpg` instead of `Thisisavalidimagename.jpg`.
NOTE: I did find another error (length truncation at the end) and will be updating this very soon.