Last active
November 5, 2021 19:45
-
-
Save sebres/3f61fd1f1559cf9bb6a2beffbc9ea4c5 to your computer and use it in GitHub Desktop.
detect-inv-chars.tcl -- detect invisible chars in files or directories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env tclsh
# -------------------------------------------------------------------------
# Script to detect the following zero-width or invisible characters:
#
#   LRE U+202A  RLE U+202B  LRO U+202D  RLO U+202E  LRI U+2066  RLI U+2067  FSI U+2068  PDF U+202C  PDI U+2069
#   ZWS U+200B  ZWNJ U+200C  ZWJ U+200D  ZWNBSP U+FEFF
#
# Copyright (c) 2016-2020 by Sergey G. Brester aka sebres
# -------------------------------------------------------------------------
# Table of the characters to detect, as a flat list of {name char} pairs.
# The list is written in braces, so the \uXXXX escapes are NOT substituted
# here; they are resolved when the value is parsed as a list (by foreach,
# string map, dict ...), which yields the real Unicode character for every
# second element. The list must therefore contain nothing but the pairs.
variable INV_CHARS {
  LRE \u202A  RLE \u202B  LRO \u202D  RLO \u202E  LRI \u2066  RLI \u2067  FSI \u2068  PDF \u202C  PDI \u2069
  ZWS \u200B  ZWNJ \u200C  ZWJ \u200D  ZWNBSP \uFEFF
}
# Replacement map (for [string map]) derived from INV_CHARS:
#  - a \x01 already present in the text is turned into \x02 first, so that
#    afterwards every \x01 in the mapped text is the start of a marker;
#  - every invisible char is replaced by the marker "\x01<NAME>\x02".
# A marker is longer than the char it replaces, so a changed string length
# after mapping signals that at least one invisible char was present.
variable INV_CHARS_RPL [apply {{inch} {
  set m {\x01 \x02}
  foreach {ab ch} $inch {
    lappend m $ch "\x01$ab\x02"
  }
  return $m
}} $INV_CHARS]
## -------------------------------------- | |
# findInvChar --
#
#   Scan file $fn (decoded as UTF-8) for the characters listed in INV_CHARS.
#   Prints the position of the first hit and, at the end, a per-character
#   hit count sorted by count (descending). A ZWNBSP (U+FEFF) at offset 0
#   is considered a BOM and is not reported.
#
# Arguments:
#   fn   path of the file to scan
#
# Results:
#   1 if at least one invisible char was found, 0 otherwise.
#   Errors from [open]/[read] propagate; the channel is always closed.
proc findInvChar {fn} {
  variable INV_CHARS
  variable INV_CHARS_RPL
  set f [open $fn rb]
  try {
    fconfigure $f -encoding utf-8 -buffersize 65536 -translation lf
    set i 0; # character offset of the current chunk within the file
    while {![eof $f]} {
      # read a chunk and extend it up to the end of the current line
      # (gets drops the newline, this is compensated in the offset below):
      set buf [read $f 65000]
      append buf [gets $f]
      set l [string length $buf]
      set buf [string map $INV_CHARS_RPL $buf]
      # markers are longer than the chars they replace, so a changed length
      # means the chunk contains at least one invisible char:
      if {[string length $buf] != $l} {
        set p 0
        # number of extra chars inserted by the markers preceding $p; needed
        # to translate a position in the mapped chunk back to the original:
        set shift 0
        while {[set p [string first \x01 $buf $p]] != -1} {
          regexp -start $p {\x01([^\x02]+)\x02} $buf -> ch
          set pos [expr {$i + $p - $shift}]
          # ignore BOM (at start only):
          if {$pos != 0 || $ch ne "ZWNBSP"} {
            # found:
            if {![array size fnd]} {
              puts [format "%s:\n found first invisible char U+%X \[%s\] at %u" \
                $fn [scan [string map $INV_CHARS $ch] %c] $ch $pos]
            }
            incr fnd($ch)
          }
          # skip the marker (\x01 + name + \x02), which stands for 1 char:
          set mlen [expr {[string length $ch] + 2}]
          incr p $mlen
          incr shift [expr {$mlen - 1}]
        }
      }
      incr i $l; incr i; # buffer offset (current length + NL)
    }
  } finally {
    close $f
  }
  if {[array size fnd]} {
    puts " hits (char # count):"
    foreach {ch cnt} [lsort -stride 2 -index 1 -integer -decreasing [array get fnd]] {
      puts [format " \tU+%X \[%s\]\t # %u" [scan [string map $INV_CHARS $ch] %c] $ch $cnt]
    }
    return 1
  }
  return 0
}
## -------------------------------------- | |
# isBinaryFile --
#
#   Heuristically decide whether file $fn is binary (and so not worth
#   scanning for invisible characters).
#
#   1. By extension: the lower-cased extension (without the dot) is looked
#      up in a list of known binary extensions; on a match the file is not
#      opened at all. All keys must be lower-case for that reason.
#   2. By content of the first 16KB: the file is binary if it is empty, if
#      more than 1% of the bytes are \x00 - \x08, or if more than 4% of the
#      bytes are >= \x80 and not part of a lead-byte + continuation-bytes
#      sequence.
#
# Arguments:
#   fn   path of the file to check
#
# Results:
#   1 if the file is considered binary, 0 otherwise.
#   Errors from [open]/[read] propagate; the channel is always closed.
proc isBinaryFile {fn} {
  # may be too slow:
  #set r [regexp {binary$} [exec file --mime-encoding $fn]]
  # simplest algorithm to check file is binary:
  # by list of known binary extensions:
  if {[dict exists {
      3dm 1 3ds 1 3g2 1 3gp 1 7z 1 a 1 aac 1 adp 1 ai 1 aif 1 aiff 1 alz 1 ape 1 apk 1 appimage 1 ar 1 arj 1 asf 1 au 1 avi 1
      bak 1 baml 1 bh 1 bin 1 bk 1 bmp 1 btif 1 bz2 1 bzip2 1 cab 1 caf 1 cgm 1 class 1 cmx 1 cpio 1 cr2 1 cur 1 dat 1 dcm 1
      deb 1 dex 1 djvu 1 dll 1 dmg 1 dng 1 doc 1 docm 1 docx 1 dot 1 dotm 1 dra 1 ds_store 1 dsk 1 dts 1 dtshd 1 dvb 1 dwg 1
      dxf 1 ecelp4800 1 ecelp7470 1 ecelp9600 1 egg 1 eol 1 eot 1 epub 1 exe 1 f4v 1 fbs 1 fh 1 fla 1 flac 1 flatpak 1 fli 1
      flv 1 fpx 1 fst 1 fvt 1 g3 1 gh 1 gif 1 graffle 1 gz 1 gzip 1 h261 1 h263 1 h264 1 icns 1 ico 1 ief 1 img 1 ipa 1 iso 1
      jar 1 jpeg 1 jpg 1 jpgv 1 jpm 1 jxr 1 key 1 ktx 1 lha 1 lib 1 lvp 1 lz 1 lzh 1 lzma 1 lzo 1 m3u 1 m4a 1 m4v 1 mar 1 mdi 1
      mht 1 mid 1 midi 1 mj2 1 mka 1 mkv 1 mmr 1 mng 1 mobi 1 mov 1 movie 1 mp3 1 mp4 1 mp4a 1 mpeg 1 mpg 1 mpga 1 mxu 1 nef 1
      npx 1 numbers 1 nupkg 1 o 1 obj 1 odp 1 ods 1 odt 1 oga 1 ogg 1 ogv 1 otf 1 ott 1 pages 1 pbm 1 pcx 1 pdb 1 pdf 1 pea 1
      pgm 1 pic 1 png 1 pnm 1 pot 1 potm 1 potx 1 ppa 1 ppam 1 ppm 1 pps 1 ppsm 1 ppsx 1 ppt 1 pptm 1 pptx 1 psd 1 pya 1 pyc 1
      pyo 1 pyv 1 qt 1 rar 1 ras 1 raw 1 resources 1 rgb 1 rip 1 rlc 1 rmf 1 rmvb 1 rpm 1 rtf 1 rz 1 s3m 1 s7z 1 scpt 1 sgi 1 shar 1
      snap 1 sil 1 sketch 1 slk 1 smv 1 snk 1 so 1 stl 1 suo 1 sub 1 swf 1 tar 1 tbz 1 tbz2 1 tga 1 tgz 1 thmx 1 tif 1 tiff 1
      tlz 1 ttc 1 ttf 1 txz 1 udf 1 uvh 1 uvi 1 uvm 1 uvp 1 uvs 1 uvu 1 viv 1 vob 1 war 1 wav 1 wax 1 wbmp 1 wdp 1 weba 1
      webm 1 webp 1 whl 1 wim 1 wm 1 wma 1 wmv 1 wmx 1 woff 1 woff2 1 wrm 1 wvx 1 xbm 1 xif 1 xla 1 xlam 1 xls 1 xlsb 1 xlsm 1
      xlsx 1 xlt 1 xltm 1 xltx 1 xm 1 xmind 1 xpi 1 xpm 1 xwd 1 xz 1 z 1 zip 1 zipx 1
    } [string tolower [string range [file extension $fn] 1 end]]]
  } {
    return 1
  }
  # if > 1% \x00 - \x08 chars or > 4% non utf-8 chars > \x80 in first 16KB:
  set r 0
  set f [open $fn rb]
  try {
    set buf [read $f 16384]
    set l [string length $buf]
    # an empty file ($l == 0) is reported as binary too, which also guards
    # the divisions below against a zero divisor:
    if {
      !$l ||
      double([regexp -all {[\x00-\x08]} $buf]) / $l > 0.01 ||
      double([regexp -all {[\x80-\xBF\xFE\xFF]} [regsub -all {[\xC0-\xFD][\x80-\xBF]{1,6}} $buf {}]]) / $l > 0.04
    } {
      set r 1
    }
  } finally {
    close $f
  }
  return $r
}
## -------------------------------------- | |
# findInvCharRecursive --
#
#   Scan all files of directory $dir and, recursively, of its
#   sub-directories with findInvChar. Files recognized by isBinaryFile and
#   directories named ".git" are skipped.
#
# Arguments:
#   dir     directory to scan
#   _stat   optional name of an array in the caller's scope that collects
#           the counters dirs, ignoreddirs, files and ignoredfiles
#
# Results:
#   Number of files in which invisible chars were found.
proc findInvCharRecursive {dir {_stat {}}} {
  set rc 0
  if {$_stat ne {}} { upvar $_stat stat }
  foreach fn [glob -types d -nocomplain -directory $dir *] {
    # glob returns the path including $dir, so compare the last path
    # element only (the full path would never be equal to ".git"):
    if {[file tail $fn] in {.git}} {
      incr stat(ignoreddirs)
      continue
    }
    incr stat(dirs)
    incr rc [findInvCharRecursive $fn stat]
  }
  foreach fn [glob -types f -nocomplain -directory $dir *] {
    if {![isBinaryFile $fn]} {
      incr stat(files)
      incr rc [findInvChar $fn]
    } else {
      incr stat(ignoredfiles)
    }
  }
  return $rc
}
## -------------------------------------- | |
# Entry point: executed only if this file is run as a script (argv0 is this
# very file and not the interpreter executable), not if it is sourced.
# Every argument is a file or a directory to scan (default: the current
# directory). Prints a summary to stdout and the number of affected files
# to stderr; exits with code 1 if invisible chars were found in any file.
if {[info exists ::argv] && $::argv0 ne [info nameofexecutable] && [file normalize $::argv0] eq [file normalize [info script]]} {
  # # switch to pcre if you can:
  # catch {interp regexp {} pcre}
  if {![llength $argv]} {
    set argv {.}
  }
  set rc 0
  set stat(files) 0
  foreach fn $argv {
    if {[file isdirectory $fn]} {
      incr stat(dirs)
      incr rc [findInvCharRecursive $fn stat]
    } else {
      incr stat(files)
      incr rc [findInvChar $fn]
    }
  }
  # summary line, built piece by piece:
  puts -nonewline "processed: $stat(files) file(s)"
  if {[info exists stat(dirs)]} {puts -nonewline " in $stat(dirs) dir(s)"}
  if {[info exists stat(ignoreddirs)] || [info exists stat(ignoredfiles)]} {
    puts -nonewline ", ignored:"
    if {[info exists stat(ignoredfiles)]} {puts -nonewline " $stat(ignoredfiles) file(s)"}
    if {[info exists stat(ignoreddirs)]} {puts -nonewline " $stat(ignoreddirs) dir(s)"}
  }
  # always terminate the summary line, also if nothing was ignored:
  puts ""
  puts stderr "found $rc file(s)"
  if {$rc} {
    exit 1
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment