Created
August 1, 2010 17:43
-
-
Save goerz/503568 to your computer and use it in GitHub Desktop.
latex_checkascii.pl: checks LaTeX files for non-ASCII characters and optionally replaces them with their TeX representations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
############################################################################ | |
# Copyright (C) 2008 by Michael Goerz # | |
# http://www.physik.fu-berlin.de/~mgoerz # | |
# # | |
# This program is free software; you can redistribute it and/or modify # | |
# it under the terms of the GNU General Public License as published by # | |
# the Free Software Foundation; either version 3 of the License, or # | |
# (at your option) any later version. # | |
# # | |
# This program is distributed in the hope that it will be useful, # | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of # | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # | |
# GNU General Public License for more details. # | |
############################################################################ | |
# Help text printed for --help, -h and --usage.
# Single-quoted on purpose: \usepackage and \dots must stay literal.
my $usage = '
Usage: latex_checkascii.pl [options] files

Options are:
    --noreplace   No replacements
    --german      Replacements of the german babel package

The program can be used to ensure that latex files are in pure ascii.
It does the following things:
  - a few standard nonascii characters (e.g. em-dash) are replaced by their
    tex representation, unless the --noreplace option is given.
  - if the --german option is given, the conventions of
    \usepackage[german]{babel} are followed. (eg. u-umlaut => "u)
  - if the latex files contain an unescaped quote (") that is not part
    of the known replacements, a warning is issued.
  - if the latex files contain more than one consecutive period, a warning
    is issued (this should probably be a \dots)
  - if after replacements are done there are non-ascii characters left in
    the file, a warning is issued.
';
# Run-time settings; the command line below may override the defaults.
my $german           = 0;     # switched on by --german
my $check_for_quotes = 1;     # the quote check is always enabled
my @files            = ();    # every argument that is not a known option

# Default substitutions: typographic character => TeX representation.
my %replacements = (
    '–' => '--',
    '—' => '---',
    '“' => '``',
    '”' => "''",
    '„' => '``',
    '’' => "'",
);

# Walk the arguments once. Anything that is not a recognised option is
# taken to be the name of a file to check.
ARGUMENT:
for my $arg (@ARGV) {
    # Any of the help spellings prints the usage text and stops right away.
    if (grep { $arg eq $_ } ('--help', '-h', '--usage')) {
        print $usage;
        exit(0);
    }
    if ($arg eq '--german') {
        $german = 1;
        next ARGUMENT;
    }
    if ($arg eq '--noreplace') {
        # Drop all default substitutions.
        %replacements = ();
        next ARGUMENT;
    }
    push(@files, $arg);
}

# Without at least one file there is nothing to do.
die("You didn't provide a latex file to check\n") unless @files;
# Substitutions that follow the shorthands of \usepackage[german]{babel}.
my %germanreplacements = (
    'ä' => '"a',
    'ö' => '"o',
    'ü' => '"u',
    'Ä' => '"A',
    'Ö' => '"O',
    'Ü' => '"U',
    'ß' => '"s',
    '„' => '"`',
    '”' => "\"'",
    '«' => '"<',
    '»' => '">',
    # \flq and \frq are control words: TeX would absorb directly following
    # letters into the command name and swallow a following space, so each
    # one is terminated with an empty group.
    '‹' => '\flq{}',
    '›' => '\frq{}',
);
if ($german) {
    # Put in german replacements.
    # If a replacement is defined both in the default dict and the german
    # dict, german takes the preference (later pairs win in a list merge).
    %replacements = (%replacements, %germanreplacements);
}
# The diagnostics quote the offending (non-ascii) characters, so STDERR needs
# a UTF-8 layer to print them without "Wide character" warnings.
binmode(STDERR, ':utf8');

# Character sequences that the active replacement table itself produces
# (e.g. "a with --german). A quote starting one of these is legitimate.
my %known_replacement = map { $_ => 1 } values(%replacements);

# Process every file: write the converted text to "<file>.asciiout", then
# keep the original as "<file>~" and move the converted text into place.
foreach my $file (@files) {
    my $outfile = "$file.asciiout";

    # Refuse to clobber a leftover output file from an earlier run.
    die("$outfile already exists\n") if (-f $outfile);

    open(my $in,  '<:utf8', $file)    or die("Couldn't open $file: $!\n");
    open(my $out, '>:utf8', $outfile) or die("Couldn't open $outfile: $!\n");
    print("Analyzing $file\n");

    my $linenumber = 0;
    while (defined(my $line = <$in>)) {
        $linenumber++;

        if ($check_for_quotes) {
            # Look at every quote that is not preceded by a backslash, not
            # just the first one on the line. Warn once per line at most.
            while ($line =~ /(?<!\\)(".?)/g) {
                next if $known_replacement{$1};
                warn(" Line $linenumber contains unescaped quotation marks\n");
                last;
            }
        }

        if ($line =~ /\.{2,}/) {
            warn(" Line $linenumber contains more than one consecutive period.\n");
        }

        for my $char (split(//, $line)) {
            my $output = $char;
            if (defined($replacements{$char})) {
                warn(" Replacing '$char' with $replacements{$char} in line $linenumber.\n");
                $output = $replacements{$char};
            }
            # ASCII ends at code point 127; anything above is reported.
            if ($output =~ /[^\x00-\x7F]/) {
                warn(" Non-ascii letter '$output' in line $linenumber\n");
            }
            print {$out} $output;
        }
    }

    close($in);
    # Buffered write errors only surface on close, so check it.
    close($out) or die("Couldn't write $outfile: $!\n");

    # Only replace the original once the backup copy is known to exist.
    rename($file, "$file~")
        or die("Couldn't rename $file to $file~: $!\n");
    rename($outfile, $file)
        or die("Couldn't rename $outfile to $file: $!\n");
}
print "Done\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment