garfieldnate · August 29, 2015 14:07
diff --git a/list_chars.pl b/list_chars.pl
 # This will look through an input utf8 file and print a list 
 # of all of the non-ascii characters found there. Not good for
 # big files, since it stores and sorts all occurrences.
 use strict;
 use warnings;
 use charnames ':full';
 binmode STDOUT, 'utf8';

 # Fail early with a readable message instead of silently producing an
 # empty report when the file is missing or cannot be read.
 @ARGV or die "Usage: $0 <utf8-file>\n";
 open my $fh, '<:utf8', $ARGV[0]
    or die "Cannot open '$ARGV[0]': $!\n";

 # Find all non-ascii characters and save their contexts, keyed by the
 # character's code point formatted as a "0x..." hex string.
 my %chars;
 while (my $line = <$fh>){
    while($line =~ /([^\x00-\x7F])/g){
        my $hex = sprintf("0x%x", ord $1);
        # pos($line) is the offset just past the matched character
        push @{$chars{$hex}}, get_context($line, pos($line));
    }
 }
 close $fh;

 # Order by numeric code point; a plain string sort would put "0x100"
 # ahead of "0xe9".
 my @codes = sort { hex($a) <=> hex($b) } keys %chars;

 # charnames::viacode returns undef for code points without a name, so
 # resolve every name once and substitute a placeholder in that case.
 my %name_of;
 for my $code (@codes){
    my $name = charnames::viacode($code);
    $name_of{$code} = defined $name ? $name : '<unnamed>';
 }

 # first print a summary at the top listing all of the characters
 for my $code (@codes){
    print "$code : $name_of{$code}\n";
 }

 # then print each character again, followed by all of its occurrences
 for my $code (@codes){
    print "$code : $name_of{$code}\n";
    print " $_\n" for @{$chars{$code}};
 }

 # Build a fixed-width context string for one character of a line.
 #
 # Parameters:
 #   $line - the line of text the character was found in
 #   $pos  - pos($line) right after a one-character match, i.e. the
 #           character of interest sits at offset $pos - 1
 #
 # Returns the character wrapped in square brackets with five characters
 # of context on each side. When fewer than five characters are available
 # (near the start or end of the line) the missing ones are filled with
 # spaces, so every result has the same length.
 sub get_context {
    my ($line, $pos) = @_;
    my $width = 5;

    # Drop the line terminator so it never shows up inside the context
    # and breaks the one-occurrence-per-line report. Only ASCII bytes are
    # removed from the end, so the offset of the character is unaffected.
    $line =~ s/\r?\n\z//;

    my $index = $pos - 1;    # offset of the highlighted character

    # Up to $width characters before the character, left-padded.
    my $start = $index > $width ? $index - $width : 0;
    my $front = substr($line, $start, $index - $start);
    # Parentheses are required: "x" binds tighter than "-".
    $front = (' ' x ($width - length($front))) . $front;

    # Up to $width characters after the character, right-padded.
    my $back = substr($line, $pos, $width);
    $back .= ' ' x ($width - length($back));

    return "${front}[" . substr($line, $index, 1) . "]$back";
 }
	# This will look through an input utf8 file and print a list
	# of all of the non-ascii characters found there. Not good for
	# big files, since it stores and sorts all occurrences.
	use strict;
	use warnings;
	use charnames ':full';
	binmode STDOUT, 'utf8';

	# Fail early with a readable message instead of silently producing an
	# empty report when the file is missing or cannot be read.
	@ARGV or die "Usage: $0 <utf8-file>\n";
	open my $fh, '<:utf8', $ARGV[0]
	    or die "Cannot open '$ARGV[0]': $!\n";

	# Find all non-ascii characters and save their contexts, keyed by the
	# character's code point formatted as a "0x..." hex string.
	my %chars;
	while (my $line = <$fh>){
	    while($line =~ /([^\x00-\x7F])/g){
	        my $hex = sprintf("0x%x", ord $1);
	        # pos($line) is the offset just past the matched character
	        push @{$chars{$hex}}, get_context($line, pos($line));
	    }
	}
	close $fh;

	# Order by numeric code point; a plain string sort would put "0x100"
	# ahead of "0xe9".
	my @codes = sort { hex($a) <=> hex($b) } keys %chars;

	# charnames::viacode returns undef for code points without a name, so
	# resolve every name once and substitute a placeholder in that case.
	my %name_of;
	for my $code (@codes){
	    my $name = charnames::viacode($code);
	    $name_of{$code} = defined $name ? $name : '<unnamed>';
	}

	# first print a summary at the top listing all of the characters
	for my $code (@codes){
	    print "$code : $name_of{$code}\n";
	}

	# then print each character again, followed by all of its occurrences
	for my $code (@codes){
	    print "$code : $name_of{$code}\n";
	    print " $_\n" for @{$chars{$code}};
	}

	# Build a fixed-width context string for one character of a line.
	#
	# Parameters:
	#   $line - the line of text the character was found in
	#   $pos  - pos($line) right after a one-character match, i.e. the
	#           character of interest sits at offset $pos - 1
	#
	# Returns the character wrapped in square brackets with five characters
	# of context on each side. When fewer than five characters are available
	# (near the start or end of the line) the missing ones are filled with
	# spaces, so every result has the same length.
	sub get_context {
	    my ($line, $pos) = @_;
	    my $width = 5;

	    # Drop the line terminator so it never shows up inside the context
	    # and breaks the one-occurrence-per-line report. Only ASCII bytes are
	    # removed from the end, so the offset of the character is unaffected.
	    $line =~ s/\r?\n\z//;

	    my $index = $pos - 1;    # offset of the highlighted character

	    # Up to $width characters before the character, left-padded.
	    my $start = $index > $width ? $index - $width : 0;
	    my $front = substr($line, $start, $index - $start);
	    # Parentheses are required: "x" binds tighter than "-".
	    $front = (' ' x ($width - length($front))) . $front;

	    # Up to $width characters after the character, right-padded.
	    my $back = substr($line, $pos, $width);
	    $back .= ' ' x ($width - length($back));

	    return "${front}[" . substr($line, $index, 1) . "]$back";
	}
No results found