Skip to content

Instantly share code, notes, and snippets.

@sebres
Last active November 5, 2021 19:45
Show Gist options
  • Save sebres/3f61fd1f1559cf9bb6a2beffbc9ea4c5 to your computer and use it in GitHub Desktop.
Save sebres/3f61fd1f1559cf9bb6a2beffbc9ea4c5 to your computer and use it in GitHub Desktop.
detect-inv-chars.tcl -- detect invisible chars in files or directories
#!/usr/bin/env tclsh
# -------------------------------------------------------------------------
# Script to detect the following zero-width or invisible characters:
#
# LRE U+202A RLE U+202B LRO U+202D RLO U+202E LRI U+2066 RLI U+2067 FSI U+2068 PDF U+202C PDI U+2069
# ZWS U+200B ZWNJ U+200C ZWJ U+200D ZWNBSP U+FEFF
#
# Copyright (c) 2016-2020 by Sergey G. Brester aka sebres
# -------------------------------------------------------------------------
# Table of the detected characters as a flat list of {NAME character} pairs.
# The \uXXXX escapes stay literal inside the braces; they are resolved when the
# value is parsed as a list (e.g. by foreach or string map).
variable INV_CHARS {
  LRE    \u202A
  RLE    \u202B
  LRO    \u202D
  RLO    \u202E
  LRI    \u2066
  RLI    \u2067
  FSI    \u2068
  PDF    \u202C
  PDI    \u2069
  ZWS    \u200B
  ZWNJ   \u200C
  ZWJ    \u200D
  ZWNBSP \uFEFF
}
# Replacement map for [string map], derived from INV_CHARS:
#   - \x01 is mapped to \x02 first, so any \x01 left in a mapped buffer is a marker start;
#   - every invisible character is mapped to the marker "\x01NAME\x02".
variable INV_CHARS_RPL [apply {{pairs} {
  set map [list \x01 \x02]
  foreach {name char} $pairs {
    lappend map $char "\x01${name}\x02"
  }
  return $map
}} $INV_CHARS]
## --------------------------------------
# findInvChar --
#
#   Scan one file (decoded as UTF-8) for the characters listed in INV_CHARS and
#   print a report to stdout: the position of the first hit and a per-character
#   hit count.
#
# Arguments:
#   fn   path of the file to scan
#
# Results:
#   1 if at least one invisible character was found, 0 otherwise. A ZWNBSP at
#   offset 0 (BOM) is not counted. Errors from open/read propagate to the
#   caller; the channel is closed in any case.
proc findInvChar {fn} {
  variable INV_CHARS
  variable INV_CHARS_RPL
  set f [open $fn rb]
  try {
    fconfigure $f -encoding utf-8 -buffersize 65536 -translation lf
    # character offset of the current buffer within the file:
    set i 0
    while {![eof $f]} {
      set buf [read $f 65000]
      append buf [gets $f]; # simple avoiding split buffer in the middle of utf-8 char
      set l [string length $buf]
      # each invisible char becomes "\x01NAME\x02", so a changed length means
      # the buffer contains at least one hit:
      set buf [string map $INV_CHARS_RPL $buf]
      if {[string length $buf] != $l} {
        set p 0
        # number of chars the markers preceding $p added to the mapped buffer;
        # needed to translate $p back to an offset in the original text:
        set shift 0
        while {[set p [string first \x01 $buf $p]] != -1} {
          regexp -start $p {\x01([^\x02]+)\x02} $buf ch ch
          set pos [expr {$i + $p - $shift}]
          # ignore BOM (at start only):
          if {$pos != 0 || $ch ne "ZWNBSP"} {
            # found:
            if {![array size fnd]} {
              puts [format "%s:\n found first invisible char U+%X \[%s\] at %u" \
                $fn [scan [string map $INV_CHARS $ch] %c] $ch $pos]
            }
            incr fnd($ch)
          }
          # skip the whole marker (\x01 + name + \x02), which replaced 1 char:
          set mlen [expr {[string length $ch] + 2}]
          incr p $mlen
          incr shift [expr {$mlen - 1}]
        }
      }
      incr i $l; incr i; # buffer offset (current length + NL)
    }
  } finally {
    close $f
  }
  if {[array size fnd]} {
    puts " hits (char # count):"
    # most frequent characters first:
    foreach {ch cnt} [lsort -stride 2 -index 1 -integer -decreasing [array get fnd]] {
      puts [format " \tU+%X \[%s\]\t # %u" [scan [string map $INV_CHARS $ch] %c] $ch $cnt]
    }
    return 1
  }
  return 0
}
## --------------------------------------
# isBinaryFile --
#
#   Heuristically decide whether a file is binary (and so should not be scanned).
#
# Arguments:
#   fn   path of the file to check
#
# Results:
#   1 if the file extension (compared case-insensitively) is in the list of
#   known binary extensions, if the file is empty, or if the first 16KB
#   contain more than 1% bytes in \x00-\x08 or more than 4% bytes >= \x80 that
#   are not part of a UTF-8-like multi-byte sequence; 0 otherwise.
#   The file is only opened when the extension is unknown; errors from
#   open/read propagate to the caller.
proc isBinaryFile {fn} {
  # may be too slow:
  #set r [regexp {binary$} [exec file --mime-encoding $fn]]
  # simplest algorithm to check file is binary:
  # by list of known binary extensions (keys must be lower-case, because the
  # extension is lower-cased before the lookup):
  if {[dict exists {
    3dm 1 3ds 1 3g2 1 3gp 1 7z 1 a 1 aac 1 adp 1 ai 1 aif 1 aiff 1 alz 1 ape 1 apk 1 appimage 1 ar 1 arj 1 asf 1 au 1 avi 1
    bak 1 baml 1 bh 1 bin 1 bk 1 bmp 1 btif 1 bz2 1 bzip2 1 cab 1 caf 1 cgm 1 class 1 cmx 1 cpio 1 cr2 1 cur 1 dat 1 dcm 1
    deb 1 dex 1 djvu 1 dll 1 dmg 1 dng 1 doc 1 docm 1 docx 1 dot 1 dotm 1 dra 1 ds_store 1 dsk 1 dts 1 dtshd 1 dvb 1 dwg 1
    dxf 1 ecelp4800 1 ecelp7470 1 ecelp9600 1 egg 1 eol 1 eot 1 epub 1 exe 1 f4v 1 fbs 1 fh 1 fla 1 flac 1 flatpak 1 fli 1
    flv 1 fpx 1 fst 1 fvt 1 g3 1 gh 1 gif 1 graffle 1 gz 1 gzip 1 h261 1 h263 1 h264 1 icns 1 ico 1 ief 1 img 1 ipa 1 iso 1
    jar 1 jpeg 1 jpg 1 jpgv 1 jpm 1 jxr 1 key 1 ktx 1 lha 1 lib 1 lvp 1 lz 1 lzh 1 lzma 1 lzo 1 m3u 1 m4a 1 m4v 1 mar 1 mdi 1
    mht 1 mid 1 midi 1 mj2 1 mka 1 mkv 1 mmr 1 mng 1 mobi 1 mov 1 movie 1 mp3 1 mp4 1 mp4a 1 mpeg 1 mpg 1 mpga 1 mxu 1 nef 1
    npx 1 numbers 1 nupkg 1 o 1 obj 1 odp 1 ods 1 odt 1 oga 1 ogg 1 ogv 1 otf 1 ott 1 pages 1 pbm 1 pcx 1 pdb 1 pdf 1 pea 1
    pgm 1 pic 1 png 1 pnm 1 pot 1 potm 1 potx 1 ppa 1 ppam 1 ppm 1 pps 1 ppsm 1 ppsx 1 ppt 1 pptm 1 pptx 1 psd 1 pya 1 pyc 1
    pyo 1 pyv 1 qt 1 rar 1 ras 1 raw 1 resources 1 rgb 1 rip 1 rlc 1 rmf 1 rmvb 1 rpm 1 rtf 1 rz 1 s3m 1 s7z 1 scpt 1 sgi 1 shar 1
    snap 1 sil 1 sketch 1 slk 1 smv 1 snk 1 so 1 stl 1 suo 1 sub 1 swf 1 tar 1 tbz 1 tbz2 1 tga 1 tgz 1 thmx 1 tif 1 tiff 1
    tlz 1 ttc 1 ttf 1 txz 1 udf 1 uvh 1 uvi 1 uvm 1 uvp 1 uvs 1 uvu 1 viv 1 vob 1 war 1 wav 1 wax 1 wbmp 1 wdp 1 weba 1
    webm 1 webp 1 whl 1 wim 1 wm 1 wma 1 wmv 1 wmx 1 woff 1 woff2 1 wrm 1 wvx 1 xbm 1 xif 1 xla 1 xlam 1 xls 1 xlsb 1 xlsm 1
    xlsx 1 xlt 1 xltm 1 xltx 1 xm 1 xmind 1 xpi 1 xpm 1 xwd 1 xz 1 z 1 zip 1 zipx 1
  } [string tolower [string range [file extension $fn] 1 end]]]
  } {
    return 1
  }
  # if > 1% \x00 - \x08 chars or > 4% non utf-8 chars > \x80 in first 16KB:
  set r 0
  set f [open $fn rb]
  try {
    set buf [read $f 16384]
    set l [string length $buf]
    # "!$l" comes first: an empty file counts as binary and the divisions
    # below are never evaluated with a zero length
    if {
      !$l ||
      double([regexp -all {[\x00-\x08]} $buf]) / $l > 0.01 ||
      double([regexp -all {[\x80-\xBF\xFE\xFF]} [regsub -all {[\xC0-\xFD][\x80-\xBF]{1,6}} $buf {}]]) / $l > 0.04
    } {
      set r 1
    }
  } finally {
    close $f
  }
  return $r
}
## --------------------------------------
# findInvCharRecursive --
#
#   Scan all non-binary files below a directory (recursively) with findInvChar.
#
# Arguments:
#   dir     directory to scan
#   _stat   optional name of an array in the caller's scope that collects the
#           counters dirs, ignoreddirs, files and ignoredfiles; if omitted,
#           the counters are kept in a local array and discarded
#
# Results:
#   Number of files in which findInvChar reported a hit.
proc findInvCharRecursive {dir {_stat {}}} {
  set rc 0
  if {$_stat ne {}} { upvar $_stat stat }
  foreach fn [glob -types d -nocomplain -directory $dir *] {
    # glob -directory returns names prefixed with $dir, so only the last
    # path component can be compared with the ignore list:
    if {[file tail $fn] in {".git"}} {
      incr stat(ignoreddirs)
      continue
    }
    incr stat(dirs)
    incr rc [findInvCharRecursive $fn stat]
  }
  foreach fn [glob -types f -nocomplain -directory $dir *] {
    if {![isBinaryFile $fn]} {
      incr stat(files)
      incr rc [findInvChar $fn]
    } else {
      incr stat(ignoredfiles)
    }
  }
  return $rc
}
## --------------------------------------
# Command-line entry point, executed only when this file is the script being run
# (not when it is sourced): scans every argument (default: the current
# directory), prints a summary to stdout and the number of affected files to
# stderr, and exits with status 1 if any file contains invisible characters.
if {[info exists ::argv] && $::argv0 ne [info nameofexecutable] && [file normalize $::argv0] eq [file normalize [info script]]} {
  # # switch to pcre if you can:
  # catch {interp regexp {} pcre}
  if {![llength $argv]} {
    set argv {.}
  }
  set rc 0
  set stat(files) 0
  foreach fn $argv {
    if {[file isdirectory $fn]} {
      incr stat(dirs)
      incr rc [findInvCharRecursive $fn stat]
    } else {
      incr stat(files)
      incr rc [findInvChar $fn]
    }
  }
  puts -nonewline "processed: $stat(files) file(s)"
  if {[info exists stat(dirs)]} {puts -nonewline " in $stat(dirs) dir(s)"}
  if {[info exists stat(ignoreddirs)] || [info exists stat(ignoredfiles)]} {
    puts -nonewline ", ignored:"
    if {[info exists stat(ignoredfiles)]} {puts -nonewline " $stat(ignoredfiles) file(s)"}
    if {[info exists stat(ignoreddirs)]} {puts -nonewline " $stat(ignoreddirs) dir(s)"}
  }
  # always terminate the summary line, also when nothing was ignored:
  puts ""
  puts stderr "found $rc file(s)"
  if {$rc} {
    exit 1
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment