Last active
August 29, 2015 14:07
-
-
Save garfieldnate/2226d5d52eb8994c1eb7 to your computer and use it in GitHub Desktop.
List non-ascii characters in input file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This will look through an input UTF-8 file and print a list
# of all of the non-ASCII characters found there. Not good for
# big files, since it stores and sorts all occurrences.
#
# Usage: perl <this script> FILE
use strict;
use warnings;
use charnames ':full';

# Emit real UTF-8 on output so the highlighted characters print correctly.
binmode STDOUT, ':encoding(UTF-8)';

my $path = $ARGV[0];
die "Usage: $0 FILE\n" unless defined $path;

# The strict :encoding(UTF-8) layer validates the input instead of
# blindly trusting it the way the lax :utf8 layer does.
open my $fh, '<:encoding(UTF-8)', $path
    or die "Cannot open '$path' for reading: $!\n";

# Find all non-ASCII characters and save their contexts, keyed by the
# code point written as a hex string (e.g. "0xe9").
my %chars;
while (my $line = <$fh>) {
    # Drop the line terminator so it never shows up inside a context.
    $line =~ s/[\r\n]+\z//;
    while ($line =~ /([^\x00-\x7F])/g) {
        my $hex = sprintf("0x%x", ord $1);
        # pos() is the offset just past the character that matched.
        push @{ $chars{$hex} }, get_context($line, pos($line));
    }
}
close $fh;

# Order by numeric code point; a plain string sort would put
# "0x100" before "0xe9".
my @codes = sort { hex($a) <=> hex($b) } keys %chars;

# Look each name up once. viacode() returns undef for code points that
# have no name, so substitute a placeholder instead of warning.
my %name_of;
for my $code (@codes) {
    my $name = charnames::viacode($code);
    $name_of{$code} = defined $name ? $name : '<unnamed>';
}

# First print a report at the top listing all of the characters.
for my $code (@codes) {
    print "$code : $name_of{$code}\n";
}

# Then print their names, code points, and occurrences.
for my $code (@codes) {
    print "$code : $name_of{$code}\n";
    print " $_\n" for @{ $chars{$code} };
}
# Return a fixed-width context string for one character of a line:
# $width characters on each side (space-padded where the line is too
# short) with the character itself wrapped in square brackets, e.g.
# "cdefg[X]hijkl".
#
#   $line  - the text the character was found in; a trailing line
#            terminator is ignored so it never appears in the context
#   $pos   - offset just past the character, i.e. the value of pos()
#            right after a match on that single character
#   $width - optional number of context characters per side (default 5)
sub get_context {
    my ($line, $pos, $width) = @_;
    $width = 5 unless defined $width;

    $line =~ s/[\r\n]+\z//;

    my $index  = $pos - 1;       # 0-based offset of the highlighted character
    my $length = length $line;

    # Leading context: left-pad with spaces when fewer than $width
    # characters precede the highlighted one. The parentheses matter:
    # "x" binds tighter than "-".
    my $front;
    if ($index < $width) {
        $front = (' ' x ($width - $index)) . substr($line, 0, $index);
    }
    else {
        $front = substr($line, $index - $width, $width);
    }

    # Trailing context: starts right after the highlighted character
    # (offset $pos) and is right-padded when the line ends early.
    my $remaining = $length - $pos;
    my $back;
    if ($remaining < $width) {
        $back = substr($line, $pos) . (' ' x ($width - $remaining));
    }
    else {
        $back = substr($line, $pos, $width);
    }

    return "${front}[" . substr($line, $index, 1) . "]$back";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment