sebres · November 5, 2021 19:45
diff --git a/detect-inv-chars.tcl b/detect-inv-chars.tcl
 #!/usr/bin/env tclsh
 # -------------------------------------------------------------------------
 # Script to detect following zero-width or invisible characters:
 #
 #   LRE U+202A RLE U+202B LRO U+202D RLO U+202E LRI U+2066 RLI U+2067 FSI U+2068 PDF U+202C PDI U+2069
 #   ZWS U+200B ZWNJ U+200C ZWJ U+200D ZWNBSP U+FEFF
 # 
 # Copyright (c) 2016-2020 by Sergey G. Brester aka sebres
 # -------------------------------------------------------------------------

 # Table of the detected characters as a flat list of "abbreviation char" pairs.
 # The \uXXXX escapes stay literal inside the braces; they turn into the real
 # characters when the value is parsed as a list (foreach, string map).
 variable INV_CHARS {
  LRE \u202A RLE \u202B LRO \u202D RLO \u202E LRI \u2066 RLI \u2067 FSI \u2068 PDF \u202C PDI \u2069
  ZWS \u200B ZWNJ \u200C ZWJ \u200D ZWNBSP \uFEFF
 }
 # Substitution table for [string map]: every invisible char is replaced by its
 # abbreviation enclosed in \x01 ... \x02. A \x01 already present in the input
 # is mapped to \x02, so that \x01 in the result always opens a marker.
 variable INV_CHARS_RPL [apply {{pairs} {
  set table [list \x01 \x02]
  foreach {abbr char} $pairs {
    lappend table $char [format "\x01%s\x02" $abbr]
  }
  return $table
 }} $INV_CHARS]

 ## --------------------------------------

 # Scan one file for the invisible characters listed in INV_CHARS.
 #
 # The file is decoded as UTF-8 and processed in chunks of about 65000
 # characters, each extended to the end of the current line. For a file with
 # hits a report is written to stdout: the first hit with its zero-based
 # character offset, then a hit count per character (highest count first).
 # A ZWNBSP at offset 0 is taken as BOM and is not counted.
 #
 # Arguments:
 #   fn - path of the file to scan
 # Returns:
 #   1 if at least one invisible character was found, 0 otherwise.
 proc findInvChar {fn} {
  variable INV_CHARS
  variable INV_CHARS_RPL

  set f [open $fn rb]
  try {

    fconfigure $f -encoding utf-8 -buffersize 65536 -translation lf
    set i 0; # offset of the current chunk within the file (in chars)
    while {![eof $f]} {
      set buf [read $f 65000]
      append buf [gets $f]; # simple avoiding split buffer in the middle of utf-8 char
      set l [string length $buf]
      set buf [string map $INV_CHARS_RPL $buf]
      # the mapping changes the length only if some invisible char was replaced:
      if {[string length $buf] != $l} {
        set p 0
        # number of chars the markers before $p have added to the mapped buffer;
        # needed to translate $p back to a position in the original text:
        set grow 0
        while {[set p [string first \x01 $buf $p]] != -1} {
          regexp -start $p {\x01([^\x02]+)\x02} $buf ch ch
          set pos [expr {$i + $p - $grow}]
          # ignore BOM (at start only):
          if {$pos != 0 || $ch ne "ZWNBSP"} {
            # found:
            if {![array size fnd]} {
              puts [format "%s:\n  found first invisible char U+%X \[%s\] at %u" \
                      $fn [scan [string map $INV_CHARS $ch] %c] $ch $pos]
            }
            incr fnd($ch)
            #return 1
          }
          # skip the whole marker (\x01 + abbreviation + \x02), it replaced 1 char:
          set mlen [expr {[string length $ch] + 2}]
          incr p $mlen
          incr grow [expr {$mlen - 1}]
        }
      }
      incr i $l; incr i; # buffer offset (current length + NL)
    }

  } finally {
    close $f
  }
  if {[array size fnd]} {
    puts "  hits (char # count):"
    foreach {ch cnt} [lsort -stride 2 -index 1 -integer -decreasing [array get fnd]] {
      puts [format "  \tU+%X \[%s\]\t # %u" [scan [string map $INV_CHARS $ch] %c] $ch $cnt]
    }
    return 1
  }
  return 0
 }

 ## --------------------------------------

 # Heuristically decide whether a file is binary (and should not be scanned).
 #
 # A file is considered binary if its lower-cased extension is in the list of
 # known binary extensions (checked first, the file is not opened then), or if
 # its first 16KB are empty, contain more than 1% bytes in \x00-\x08, or more
 # than 4% high bytes that are not part of a lead byte + continuation bytes
 # sequence.
 #
 # Arguments:
 #   fn - path of the file to check
 # Returns:
 #   1 if the file is considered binary, 0 otherwise.
 proc isBinaryFile {fn} {
  # may be too slow:
  #set r [regexp {binary$} [exec file --mime-encoding $fn]]

  # simplest algorithm to check file is binary:
  # by list of known binary extensions (keys must be lower case, because the
  # extension is lower-cased before the lookup):
  set ext [string tolower [string range [file extension $fn] 1 end]]
  if {[dict exists {
      3dm 1 3ds 1 3g2 1 3gp 1 7z 1 a 1 aac 1 adp 1 ai 1 aif 1 aiff 1 alz 1 ape 1 apk 1 appimage 1 ar 1 arj 1 asf 1 au 1 avi 1
      bak 1 baml 1 bh 1 bin 1 bk 1 bmp 1 btif 1 bz2 1 bzip2 1 cab 1 caf 1 cgm 1 class 1 cmx 1 cpio 1 cr2 1 cur 1 dat 1 dcm 1
      deb 1 dex 1 djvu 1 dll 1 dmg 1 dng 1 doc 1 docm 1 docx 1 dot 1 dotm 1 dra 1 ds_store 1 dsk 1 dts 1 dtshd 1 dvb 1 dwg 1
      dxf 1 ecelp4800 1 ecelp7470 1 ecelp9600 1 egg 1 eol 1 eot 1 epub 1 exe 1 f4v 1 fbs 1 fh 1 fla 1 flac 1 flatpak 1 fli 1
      flv 1 fpx 1 fst 1 fvt 1 g3 1 gh 1 gif 1 graffle 1 gz 1 gzip 1 h261 1 h263 1 h264 1 icns 1 ico 1 ief 1 img 1 ipa 1 iso 1
      jar 1 jpeg 1 jpg 1 jpgv 1 jpm 1 jxr 1 key 1 ktx 1 lha 1 lib 1 lvp 1 lz 1 lzh 1 lzma 1 lzo 1 m3u 1 m4a 1 m4v 1 mar 1 mdi 1
      mht 1 mid 1 midi 1 mj2 1 mka 1 mkv 1 mmr 1 mng 1 mobi 1 mov 1 movie 1 mp3 1 mp4 1 mp4a 1 mpeg 1 mpg 1 mpga 1 mxu 1 nef 1
      npx 1 numbers 1 nupkg 1 o 1 obj 1 odp 1 ods 1 odt 1 oga 1 ogg 1 ogv 1 otf 1 ott 1 pages 1 pbm 1 pcx 1 pdb 1 pdf 1 pea 1
      pgm 1 pic 1 png 1 pnm 1 pot 1 potm 1 potx 1 ppa 1 ppam 1 ppm 1 pps 1 ppsm 1 ppsx 1 ppt 1 pptm 1 pptx 1 psd 1 pya 1 pyc 1
      pyo 1 pyv 1 qt 1 rar 1 ras 1 raw 1 resources 1 rgb 1 rip 1 rlc 1 rmf 1 rmvb 1 rpm 1 rtf 1 rz 1 s3m 1 s7z 1 scpt 1 sgi 1 shar 1
      snap 1 sil 1 sketch 1 slk 1 smv 1 snk 1 so 1 stl 1 suo 1 sub 1 swf 1 tar 1 tbz 1 tbz2 1 tga 1 tgz 1 thmx 1 tif 1 tiff 1
      tlz 1 ttc 1 ttf 1 txz 1 udf 1 uvh 1 uvi 1 uvm 1 uvp 1 uvs 1 uvu 1 viv 1 vob 1 war 1 wav 1 wax 1 wbmp 1 wdp 1 weba 1
      webm 1 webp 1 whl 1 wim 1 wm 1 wma 1 wmv 1 wmx 1 woff 1 woff2 1 wrm 1 wvx 1 xbm 1 xif 1 xla 1 xlam 1 xls 1 xlsb 1 xlsm 1
      xlsx 1 xlt 1 xltm 1 xltx 1 xm 1 xmind 1 xpi 1 xpm 1 xwd 1 xz 1 z 1 zip 1 zipx 1
    } $ext]
  } {
    return 1
  }
  # if > 1% \x00 - \x08 chars or > 4% non utf-8 chars > \x80 in first 16KB:
  set r 0
  set f [open $fn rb]
  try {
    set buf [read $f 16384]
    set l [string length $buf]
    # an empty file is reported as binary too (!$l), which also guards the
    # divisions below against a zero length:
    if {
      !$l ||
      double([regexp -all {[\x00-\x08]} $buf]) / $l > 0.01 ||
      double([regexp -all {[\x80-\xBF\xFE\xFF]} [regsub -all {[\xC0-\xFD][\x80-\xBF]{1,6}} $buf {}]]) / $l > 0.04
    } {
      set r 1
    }
  } finally {
    close $f
  }
  return $r
 }

 ## --------------------------------------

 # Scan a directory tree: recurse into sub-directories first, then check every
 # regular file of the directory that isBinaryFile does not reject.
 #
 # Arguments:
 #   dir   - directory to scan
 #   _stat - optional name of an array in the caller's scope that collects the
 #           counters dirs, files, ignoreddirs and ignoredfiles
 # Returns:
 #   number of files in which findInvChar found invisible characters.
 #
 # NOTE(review): glob is called with the pattern "*"; per the Tcl glob manual
 # names starting with a dot have to be matched explicitly on Unix, so dot
 # files/directories are presumably not visited there - confirm if intended.
 proc findInvCharRecursive {dir {_stat {}}} {
  set rc 0
  if {$_stat ne {}} { upvar $_stat stat }
  foreach fn [glob -types d -nocomplain -directory $dir *] {
    # glob -directory returns the path including $dir, so the ignore list has
    # to be compared against the last path element only:
    if {[file tail $fn] in {".git"}} {
      incr stat(ignoreddirs)
      continue
    }
    incr stat(dirs)
    incr rc [findInvCharRecursive $fn stat]
  }
  foreach fn [glob -types f -nocomplain -directory $dir *] {
    if {![isBinaryFile $fn]} {
      incr stat(files)
      incr rc [findInvChar $fn]
    } else {
      incr stat(ignoredfiles)
    }
  }
  return $rc
 }

 ## --------------------------------------

 # Command line entry point, executed only if this file is the script started
 # by the interpreter (argv0 resolves to this script), not if it is sourced.
 # Every argument is a directory (scanned recursively) or a file (scanned
 # directly, without binary check); without arguments "." is scanned.
 # Exits with code 1 if invisible characters were found in any file.
 if {[info exists ::argv] && $::argv0 ne [info nameofexecutable] && [file normalize $::argv0] eq [file normalize [info script]]} {

  # # switch to pcre if you can:
  # catch {interp regexp {} pcre}

  if {![llength $argv]} {
    set argv {.}
  }
  set rc 0
  set stat(files) 0
  foreach fn $argv {
    if {[file isdirectory $fn]} {
      incr stat(dirs)
      incr rc [findInvCharRecursive $fn stat]
    } else {
      incr stat(files)
      incr rc [findInvChar $fn]
    }
  }
  # summary line, composed from several pieces:
  puts -nonewline "processed: $stat(files) file(s)"
  if {[info exists stat(dirs)]} {puts -nonewline " in $stat(dirs) dir(s)"}
  if {[info exists stat(ignoreddirs)] || [info exists stat(ignoredfiles)]} {
    puts -nonewline ", ignored:"
    if {[info exists stat(ignoredfiles)]} {puts -nonewline " $stat(ignoredfiles) file(s)"}
    if {[info exists stat(ignoreddirs)]} {puts -nonewline " $stat(ignoreddirs) dir(s)"}
  }
  # terminate the summary line in any case (also if nothing was ignored):
  puts ""
  puts stderr "found $rc file(s)"
  if {$rc} {
    exit 1
  }
 }
	#!/usr/bin/env tclsh
	# -------------------------------------------------------------------------
	# Script to detect following zero-width or invisible characters:
	#
	# LRE U+202A RLE U+202B LRO U+202D RLO U+202E LRI U+2066 RLI U+2067 FSI U+2068 PDF U+202C PDI U+2069
	# ZWS U+200B ZWNJ U+200C ZWJ U+200D ZWNBSP U+FEFF
	#
	# Copyright (c) 2016-2020 by Sergey G. Brester aka sebres
	# -------------------------------------------------------------------------

	# Table of the detected characters as a flat list of "abbreviation char" pairs.
	# The \uXXXX escapes stay literal inside the braces; they turn into the real
	# characters when the value is parsed as a list (foreach, string map).
	variable INV_CHARS {
	LRE \u202A RLE \u202B LRO \u202D RLO \u202E LRI \u2066 RLI \u2067 FSI \u2068 PDF \u202C PDI \u2069
	ZWS \u200B ZWNJ \u200C ZWJ \u200D ZWNBSP \uFEFF
	}
	# Substitution table for [string map]: every invisible char is replaced by its
	# abbreviation enclosed in \x01 ... \x02. A \x01 already present in the input
	# is mapped to \x02, so that \x01 in the result always opens a marker.
	variable INV_CHARS_RPL [apply {{pairs} {
	  set table [list \x01 \x02]
	  foreach {abbr char} $pairs {
	    lappend table $char [format "\x01%s\x02" $abbr]
	  }
	  return $table
	}} $INV_CHARS]

	## --------------------------------------

	# Scan one file for the invisible characters listed in INV_CHARS.
	#
	# The file is decoded as UTF-8 and processed in chunks of about 65000
	# characters, each extended to the end of the current line. For a file with
	# hits a report is written to stdout: the first hit with its zero-based
	# character offset, then a hit count per character (highest count first).
	# A ZWNBSP at offset 0 is taken as BOM and is not counted.
	#
	# Arguments:
	#   fn - path of the file to scan
	# Returns:
	#   1 if at least one invisible character was found, 0 otherwise.
	proc findInvChar {fn} {
	  variable INV_CHARS
	  variable INV_CHARS_RPL

	  set f [open $fn rb]
	  try {

	    fconfigure $f -encoding utf-8 -buffersize 65536 -translation lf
	    set i 0; # offset of the current chunk within the file (in chars)
	    while {![eof $f]} {
	      set buf [read $f 65000]
	      append buf [gets $f]; # simple avoiding split buffer in the middle of utf-8 char
	      set l [string length $buf]
	      set buf [string map $INV_CHARS_RPL $buf]
	      # the mapping changes the length only if some invisible char was replaced:
	      if {[string length $buf] != $l} {
	        set p 0
	        # number of chars the markers before $p have added to the mapped buffer;
	        # needed to translate $p back to a position in the original text:
	        set grow 0
	        while {[set p [string first \x01 $buf $p]] != -1} {
	          regexp -start $p {\x01([^\x02]+)\x02} $buf ch ch
	          set pos [expr {$i + $p - $grow}]
	          # ignore BOM (at start only):
	          if {$pos != 0 || $ch ne "ZWNBSP"} {
	            # found:
	            if {![array size fnd]} {
	              puts [format "%s:\n found first invisible char U+%X \[%s\] at %u" \
	                      $fn [scan [string map $INV_CHARS $ch] %c] $ch $pos]
	            }
	            incr fnd($ch)
	            #return 1
	          }
	          # skip the whole marker (\x01 + abbreviation + \x02), it replaced 1 char:
	          set mlen [expr {[string length $ch] + 2}]
	          incr p $mlen
	          incr grow [expr {$mlen - 1}]
	        }
	      }
	      incr i $l; incr i; # buffer offset (current length + NL)
	    }

	  } finally {
	    close $f
	  }
	  if {[array size fnd]} {
	    puts " hits (char # count):"
	    foreach {ch cnt} [lsort -stride 2 -index 1 -integer -decreasing [array get fnd]] {
	      puts [format " \tU+%X \[%s\]\t # %u" [scan [string map $INV_CHARS $ch] %c] $ch $cnt]
	    }
	    return 1
	  }
	  return 0
	}

	## --------------------------------------

	# Heuristically decide whether a file is binary (and should not be scanned).
	#
	# A file is considered binary if its lower-cased extension is in the list of
	# known binary extensions (checked first, the file is not opened then), or if
	# its first 16KB are empty, contain more than 1% bytes in \x00-\x08, or more
	# than 4% high bytes that are not part of a lead byte + continuation bytes
	# sequence.
	#
	# Arguments:
	#   fn - path of the file to check
	# Returns:
	#   1 if the file is considered binary, 0 otherwise.
	proc isBinaryFile {fn} {
	  # may be too slow:
	  #set r [regexp {binary$} [exec file --mime-encoding $fn]]

	  # simplest algorithm to check file is binary:
	  # by list of known binary extensions (keys must be lower case, because the
	  # extension is lower-cased before the lookup):
	  set ext [string tolower [string range [file extension $fn] 1 end]]
	  if {[dict exists {
	      3dm 1 3ds 1 3g2 1 3gp 1 7z 1 a 1 aac 1 adp 1 ai 1 aif 1 aiff 1 alz 1 ape 1 apk 1 appimage 1 ar 1 arj 1 asf 1 au 1 avi 1
	      bak 1 baml 1 bh 1 bin 1 bk 1 bmp 1 btif 1 bz2 1 bzip2 1 cab 1 caf 1 cgm 1 class 1 cmx 1 cpio 1 cr2 1 cur 1 dat 1 dcm 1
	      deb 1 dex 1 djvu 1 dll 1 dmg 1 dng 1 doc 1 docm 1 docx 1 dot 1 dotm 1 dra 1 ds_store 1 dsk 1 dts 1 dtshd 1 dvb 1 dwg 1
	      dxf 1 ecelp4800 1 ecelp7470 1 ecelp9600 1 egg 1 eol 1 eot 1 epub 1 exe 1 f4v 1 fbs 1 fh 1 fla 1 flac 1 flatpak 1 fli 1
	      flv 1 fpx 1 fst 1 fvt 1 g3 1 gh 1 gif 1 graffle 1 gz 1 gzip 1 h261 1 h263 1 h264 1 icns 1 ico 1 ief 1 img 1 ipa 1 iso 1
	      jar 1 jpeg 1 jpg 1 jpgv 1 jpm 1 jxr 1 key 1 ktx 1 lha 1 lib 1 lvp 1 lz 1 lzh 1 lzma 1 lzo 1 m3u 1 m4a 1 m4v 1 mar 1 mdi 1
	      mht 1 mid 1 midi 1 mj2 1 mka 1 mkv 1 mmr 1 mng 1 mobi 1 mov 1 movie 1 mp3 1 mp4 1 mp4a 1 mpeg 1 mpg 1 mpga 1 mxu 1 nef 1
	      npx 1 numbers 1 nupkg 1 o 1 obj 1 odp 1 ods 1 odt 1 oga 1 ogg 1 ogv 1 otf 1 ott 1 pages 1 pbm 1 pcx 1 pdb 1 pdf 1 pea 1
	      pgm 1 pic 1 png 1 pnm 1 pot 1 potm 1 potx 1 ppa 1 ppam 1 ppm 1 pps 1 ppsm 1 ppsx 1 ppt 1 pptm 1 pptx 1 psd 1 pya 1 pyc 1
	      pyo 1 pyv 1 qt 1 rar 1 ras 1 raw 1 resources 1 rgb 1 rip 1 rlc 1 rmf 1 rmvb 1 rpm 1 rtf 1 rz 1 s3m 1 s7z 1 scpt 1 sgi 1 shar 1
	      snap 1 sil 1 sketch 1 slk 1 smv 1 snk 1 so 1 stl 1 suo 1 sub 1 swf 1 tar 1 tbz 1 tbz2 1 tga 1 tgz 1 thmx 1 tif 1 tiff 1
	      tlz 1 ttc 1 ttf 1 txz 1 udf 1 uvh 1 uvi 1 uvm 1 uvp 1 uvs 1 uvu 1 viv 1 vob 1 war 1 wav 1 wax 1 wbmp 1 wdp 1 weba 1
	      webm 1 webp 1 whl 1 wim 1 wm 1 wma 1 wmv 1 wmx 1 woff 1 woff2 1 wrm 1 wvx 1 xbm 1 xif 1 xla 1 xlam 1 xls 1 xlsb 1 xlsm 1
	      xlsx 1 xlt 1 xltm 1 xltx 1 xm 1 xmind 1 xpi 1 xpm 1 xwd 1 xz 1 z 1 zip 1 zipx 1
	    } $ext]
	  } {
	    return 1
	  }
	  # if > 1% \x00 - \x08 chars or > 4% non utf-8 chars > \x80 in first 16KB:
	  set r 0
	  set f [open $fn rb]
	  try {
	    set buf [read $f 16384]
	    set l [string length $buf]
	    # an empty file is reported as binary too (!$l), which also guards the
	    # divisions below against a zero length:
	    if {
	      !$l ||
	      double([regexp -all {[\x00-\x08]} $buf]) / $l > 0.01 ||
	      double([regexp -all {[\x80-\xBF\xFE\xFF]} [regsub -all {[\xC0-\xFD][\x80-\xBF]{1,6}} $buf {}]]) / $l > 0.04
	    } {
	      set r 1
	    }
	  } finally {
	    close $f
	  }
	  return $r
	}

	## --------------------------------------

	# Scan a directory tree: recurse into sub-directories first, then check every
	# regular file of the directory that isBinaryFile does not reject.
	#
	# Arguments:
	#   dir   - directory to scan
	#   _stat - optional name of an array in the caller's scope that collects the
	#           counters dirs, files, ignoreddirs and ignoredfiles
	# Returns:
	#   number of files in which findInvChar found invisible characters.
	#
	# NOTE(review): glob is called with the pattern "*"; per the Tcl glob manual
	# names starting with a dot have to be matched explicitly on Unix, so dot
	# files/directories are presumably not visited there - confirm if intended.
	proc findInvCharRecursive {dir {_stat {}}} {
	  set rc 0
	  if {$_stat ne {}} { upvar $_stat stat }
	  foreach fn [glob -types d -nocomplain -directory $dir *] {
	    # glob -directory returns the path including $dir, so the ignore list has
	    # to be compared against the last path element only:
	    if {[file tail $fn] in {".git"}} {
	      incr stat(ignoreddirs)
	      continue
	    }
	    incr stat(dirs)
	    incr rc [findInvCharRecursive $fn stat]
	  }
	  foreach fn [glob -types f -nocomplain -directory $dir *] {
	    if {![isBinaryFile $fn]} {
	      incr stat(files)
	      incr rc [findInvChar $fn]
	    } else {
	      incr stat(ignoredfiles)
	    }
	  }
	  return $rc
	}

	## --------------------------------------

	# Command line entry point, executed only if this file is the script started
	# by the interpreter (argv0 resolves to this script), not if it is sourced.
	# Every argument is a directory (scanned recursively) or a file (scanned
	# directly, without binary check); without arguments "." is scanned.
	# Exits with code 1 if invisible characters were found in any file.
	if {[info exists ::argv] && $::argv0 ne [info nameofexecutable] && [file normalize $::argv0] eq [file normalize [info script]]} {

	  # # switch to pcre if you can:
	  # catch {interp regexp {} pcre}

	  if {![llength $argv]} {
	    set argv {.}
	  }
	  set rc 0
	  set stat(files) 0
	  foreach fn $argv {
	    if {[file isdirectory $fn]} {
	      incr stat(dirs)
	      incr rc [findInvCharRecursive $fn stat]
	    } else {
	      incr stat(files)
	      incr rc [findInvChar $fn]
	    }
	  }
	  # summary line, composed from several pieces:
	  puts -nonewline "processed: $stat(files) file(s)"
	  if {[info exists stat(dirs)]} {puts -nonewline " in $stat(dirs) dir(s)"}
	  if {[info exists stat(ignoreddirs)] || [info exists stat(ignoredfiles)]} {
	    puts -nonewline ", ignored:"
	    if {[info exists stat(ignoredfiles)]} {puts -nonewline " $stat(ignoredfiles) file(s)"}
	    if {[info exists stat(ignoreddirs)]} {puts -nonewline " $stat(ignoreddirs) dir(s)"}
	  }
	  # terminate the summary line in any case (also if nothing was ignored):
	  puts ""
	  puts stderr "found $rc file(s)"
	  if {$rc} {
	    exit 1
	  }
	}