Skip to content

Instantly share code, notes, and snippets.

@goerz
Created August 1, 2010 17:43
Show Gist options
  • Save goerz/503568 to your computer and use it in GitHub Desktop.
Save goerz/503568 to your computer and use it in GitHub Desktop.
latex_checkascii.pl: checks latex files for non-ascii characters and optionally replaces them with tex representations
#!/usr/bin/perl -w
use strict;
use utf8;
############################################################################
# Copyright (C) 2008 by Michael Goerz #
# http://www.physik.fu-berlin.de/~mgoerz #
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
############################################################################
# Help text printed when --help, -h or --usage is given.
# It is single-quoted on purpose: the TeX backslashes (\usepackage, \dots)
# and the double quotes in the text must be taken literally.
my $usage = '
Usage: latex_checkascii.pl [options] files
Options are:
--noreplace No replacements
--german Replacements of the german babel package
The program can be used to ensure that latex files are in pure ascii.
It does the following things:
- a few standard nonascii characters (e.g. em-dash) are replaced by their
tex representation, unless the --noreplace option is given.
- if the --german option is given, the conventions of
\usepackage[german]{babel} are followed. (eg. u-umlaut => "u)
- if the latex files contain an unescaped quote (") that is not part
of the known replacements, a warning is issued.
- if the latex files contain more than one consecutive period, a warning
is issued (this should probably be a \dots)
- if after replacements are done there are non-ascii characters left in
the file, a warning is issued.
';
# Run-time settings; the command line parsing further down may change them.
my @files            = ();    # files named on the command line
my $german           = 0;     # switched to 1 by --german
my $check_for_quotes = 1;     # when true, lines are checked for unescaped quotes

# Default table: non-ascii character => tex representation.
my %replacements = (
    '–' => q{--},     # en dash
    '—' => q{---},    # em dash
    '“' => q{``},     # left double quotation mark
    '”' => q{''},     # right double quotation mark
    '„' => q{``},     # low double quotation mark
    '’' => q{'},      # right single quotation mark
);
# Walk over the command line once, in order. Recognised switches adjust the
# settings; everything else is collected as a file name. @ARGV itself is left
# untouched.
ARGUMENT:
for my $param (@ARGV) {
    if ($param eq '--german') {
        $german = 1;
        next ARGUMENT;
    }
    if ($param eq '--noreplace') {
        # Drop whatever is in the replacement table at this point.
        %replacements = ();
        next ARGUMENT;
    }
    if (grep { $param eq $_ } ('--help', '-h', '--usage')) {
        # Help requested: show the usage text and stop right away.
        print $usage;
        exit(0);
    }
    push(@files, $param);
}

# Without at least one file there is nothing to do.
die("You didn't provide a latex file to check\n") unless @files;
# Replacement table for documents using \usepackage[german]{babel}:
# non-ascii character => babel shorthand or macro.
my %germanreplacements = (
    'ä' => q{"a},
    'ö' => q{"o},
    'ü' => q{"u},
    'Ä' => q{"A},
    'Ö' => q{"O},
    'Ü' => q{"U},
    'ß' => q{"s},
    '„' => q{"`},
    '”' => q{"'},
    '«' => q{"<},
    '»' => q{">},
    '‹' => q{\flq},
    '›' => q{\frq},
);

# With --german the german entries are laid over the current table; where a
# character appears in both, the german representation wins.
# NOTE(review): this runs after the command line has been parsed, so
# --noreplace together with --german still leaves the german replacements
# active -- confirm that this is the intended behaviour.
if ($german) {
    $replacements{$_} = $germanreplacements{$_} for keys %germanreplacements;
}
# Quote sequences (a double quote plus the character following it) that the
# active replacements themselves produce, e.g. "a when --german is given.
# Built once up front instead of rescanning the table for every line.
my %known_quote_sequence = map { $_ => 1 } values(%replacements);

# Process each file in turn:
#   - write the converted text to "$file.asciiout" (must not exist yet),
#   - warn on STDERR about unescaped quotes, runs of periods, every
#     replacement made and every remaining non-ascii character,
#   - finally move the original to "$file~" and the converted text to $file.
# Dies if a file cannot be opened, written or renamed.
foreach my $file (@files){
    my $outname = "$file.asciiout";
    open(my $in, "<:utf8", $file) or die("Couldn't open $file\n");
    die("$outname already exists\n") if (-f $outname);
    # Report the name of the output file here, not the input file.
    open(my $out, ">:utf8", $outname) or die("Couldn't open $outname\n");
    print("Analyzing $file\n");
    my $linenumber = 0;
    # Read line by line; there is no need to hold the whole file in memory.
    while (defined(my $line = <$in>)){
        $linenumber++;
        if ($check_for_quotes){
            # Look at every double quote that is not preceded by a backslash,
            # together with the character after it (if any; a newline does
            # not count). At most one warning is issued per line.
            while ($line =~ /(?<!\\)"(?=(.?))/g){
                next if $known_quote_sequence{'"' . $1};
                warn(" Line $linenumber contains unescaped quotation marks\n");
                last;
            }
        }
        if ($line =~ /\.{2,}/){
            warn(" Line $linenumber contains more than one consecutive period.\n");
        }
        for my $letter (split //, $line){
            my $output = $letter;
            if (defined($replacements{$letter})){
                $output = $replacements{$letter};
                warn(" Replacing '$letter' with $output in line $linenumber.\n");
            }
            # ASCII covers code points 0..127, so anything above 127 is
            # reported. ord() looks at the first character of $output only.
            if (ord($output) > 127){
                warn(" Non-ascii letter '$output' in line $linenumber\n");
            }
            print {$out} $output;
        }
    }
    close($in);
    # Buffered write errors only show up on close, so check it before the
    # original file is replaced.
    close($out) or die("Couldn't write $outname: $!\n");
    # If the backup rename failed, the second rename would destroy the
    # original file, so stop at the first failure.
    rename($file, "$file~") or die("Couldn't rename $file to $file~: $!\n");
    rename($outname, $file) or die("Couldn't rename $outname to $file: $!\n");
}
print "Done\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment