Last active
March 3, 2019 19:42
-
-
Save emjotde/4c5303e3b2fc501745ae016a8d1e8e49 to your computer and use it in GitHub Desktop.
Perl script for removing documents with missing German umlauts from the WMT19 Rapid corpus; expects TSV on stdin, produces TSV on stdout.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;
use utf8;    # the umlaut literals in the patterns of this script are UTF-8 source text

# Decode input and encode output as strict UTF-8. The ":encoding(UTF-8)"
# layer validates what it reads, whereas the lax ":utf8" layer would accept
# malformed byte sequences unchecked.
binmode(STDIN,  ":encoding(UTF-8)");
binmode(STDOUT, ":encoding(UTF-8)");
# Print one document to the currently selected output handle, but only if
# its body still appears to contain its German umlauts.
#
# Arguments: the lines of a single document, in order, without trailing
#            newlines.
# Output:    the complete document (all lines, each followed by "\n") when
#            it passes the check; nothing otherwise.
#
# The first five lines are excluded from the check: they seem to be
# boilerplate and title sections that contain umlauts regardless of the
# state of the rest of the document. The remaining text passes when it
#   * contains at least one of ä, ö, ü, ß (either case), and
#   * contains none of the words "fr", "prsident...", "europisch..." --
#     presumably what is left of "für", "präsident..." and "europäisch..."
#     once the umlauts have been dropped.
sub checkAndPrint {
    my @doc = @_;

    # Everything from the sixth line on; an empty list for short documents.
    my $body = join("\n", @doc[5 .. $#doc]);

    # Both cases are listed explicitly instead of using /i: under Unicode
    # rules a case-insensitive "ß" in a bracketed class also matches the
    # plain sequence "ss", which would let umlaut-free text pass this check.
    return unless $body =~ /[äöüßÄÖÜẞ]/;

    # Tell-tale words that indicate stripped umlauts.
    return if $body =~ /\b(?:fr|prsident\p{L}*|europisch\p{L}*)\b/i;

    print join("\n", @doc), "\n";
    return;
}
# Read the input line by line and collect the lines of the current document.
# A blank line closes the document, which is then handed to checkAndPrint;
# the blank line itself stays part of the document so that it is reproduced
# in the output.
my @doc;
while (my $line = <STDIN>) {
    chomp $line;
    push(@doc, $line);

    # Empty or whitespace-only line marks document end. \s* rather than \s+:
    # after chomp a truly empty line is "" and has to count as well.
    if ($line =~ /^\s*$/) {
        checkAndPrint(@doc);
        @doc = ();
    }
}

# Handle the last document if the input did not end with a blank line.
checkAndPrint(@doc);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment