Last active
June 11, 2020 16:28
-
-
Save TheAthlete/01e0d82f2b592a238d4eb80537d26944 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use feature 'say'; | |
use utf8; | |
use open qw/:std :utf8/; | |
use FindBin qw/$Bin/; | |
use File::Slurper qw/read_lines/; | |
use DDP; | |
use List::Util qw/any/; | |
# TransliterateFilter | |
# Whitelisted URL prefixes, one per line of antimat-data/url.txt, with
# surrounding whitespace trimmed. Blank lines are dropped: an empty entry is a
# prefix of every string, so it would whitelist every URL.
my @whitelist_urls = grep { length $_ }
                     map  { s/^\s+|\s+$//gr }
                     read_lines("$Bin/antimat-data/url.txt");
# Returns true when the given URL starts with at least one entry of
# @whitelist_urls. The comparison is a literal, case-sensitive prefix test.
sub is_whitelist_url($) {
    my ($candidate) = @_;
    return any { index($candidate, $_) == 0 } @whitelist_urls;
}
# Matches a URL-like token: an optional http/https scheme, a run of characters
# that are not separators, a dot, and a tail.
# Capture group 1 is the whole token; capture group 2 is the scheme, if present.
# The second tail alternative used to read [^,[[:punct:]]\s], which closes the
# bracket class early and then demands whitespace followed by a literal "]".
# It is now a single class: any character that is neither punctuation nor
# whitespace.
my $url_re = qr{
    \b
    (
        (https?://)?            # optional scheme, capture group 2
        [^,\s()<>]+             # greedy run up to the last usable dot
        \.
        (?:
            \w+                 # tail made of word characters
            |
            (?:
                [^,[:punct:]\s] # any non-punctuation, non-space character
                |
                /               # or a slash
            )+
        )
    )
}x;
# Transliterates Latin letters in the input to Cyrillic and returns the result.
# Tokens matched by $url_re for which is_whitelist_url() is true are copied to
# the output unchanged; everything else, including URL-like tokens that are not
# whitelisted, is transliterated.
#
# Whitelisted URLs are skipped by position rather than swapped for a textual
# placeholder, so input that happens to contain a placeholder-like sequence, or
# a digit right after a URL, can no longer corrupt the result.
sub transliterate_process($) {
    my $input = shift;

    # Two-letter combinations are replaced before the single-letter table runs.
    my %gost = (yo => "ё", ch => "ч", sh => "ш", ya => "я", Yo => "Ё", Ch => "Ч", Sh => "Ш", Ya => "Я", ye => "ие", YE => "Є",);

    # Transliterates one piece of text that contains no whitelisted URL.
    my $translit = sub {
        my $text = shift;
        $text =~ s/$_/$gost{$_}/g for keys %gost;
        # NOTE(review): lowercase "y" maps to "у" but uppercase "Y" maps to "Ю";
        # this looks inconsistent - confirm the intended uppercase mapping.
        $text =~ tr/abvgdejziklmnoprstyfhcuABVGDEJZIKLMNOPRSTYFHCUwW/абвгдежзиклмнопрстуфхцуАБВГДЕЖЗИКЛМНОПРСТЮФХЦУвВ/;
        return $text;
    };

    my $output    = '';
    my $copied_to = 0;    # offset in $input up to which output has been produced

    while ($input =~ /$url_re/g) {
        # Copy the match data first: the whitelist lookup may run other matches.
        my ($url, $from, $to) = ($1, $-[0], $+[0]);
        next unless is_whitelist_url($url);

        $output .= $translit->(substr($input, $copied_to, $from - $copied_to)) . $url;
        $copied_to = $to;
    }
    $output .= $translit->(substr($input, $copied_to));

    return $output;
}
# say "$_ : " . is_whitelist_url($_) for qw/mail.ru abcde super.man/; | |
# CensorFilter | |
# Masks occurrences of the words listed in antimat-data/pattern.txt (one word
# per line) with asterisks and returns the resulting text.
# A word matches case-insensitively, with every letter allowed to repeat and
# with optional whitespace between letters.
sub censor_process($) {
    my $input = shift;

    # Trim each line and skip blank ones: an empty pattern would make s///
    # reuse the last successfully matched regex instead of matching nothing.
    my @censor_words = grep { length $_ }
                       map  { s/^\s+|\s+$//gr }
                       read_lines("$Bin/antimat-data/pattern.txt");

    for my $word (@censor_words) {
        # Build "c1+\s*c2+\s*...cN+". quotemeta keeps every character of the
        # word literal, so metacharacters in the list cannot alter the regex.
        my $pattern = join '\s*', map { quotemeta($_) . '+' } split //, $word;

        # NOTE(review): the mask is as long as the list word, not as long as
        # the matched text (common_process masks the matched length) - confirm
        # this is intended.
        $input =~ s/$pattern/ '*' x length($word) /eig;
    }

    return $input;
}
# CommonFilter | |
# Replaces every literal occurrence of each line of $filename found in $input
# with asterisks of the same length, and returns the resulting text.
# Lines are trimmed of surrounding whitespace before use.
sub common_process($$) {
    my ($filename, $input) = @_;

    # Skip blank lines: an empty pattern would make s/// reuse the last
    # successfully matched regex and mask unrelated text.
    my @common_patterns = grep { length $_ }
                          map  { s/^\s+|\s+$//gr }
                          read_lines($filename);

    for my $literal (@common_patterns) {
        # The match is literal and case-sensitive, so the matched text always
        # has the same length as the pattern itself.
        my $mask = '*' x length($literal);
        $input =~ s/\Q$literal\E/$mask/g;
    }

    return $input;
}
# Runs common_process over the given text using the entries of
# antimat-data/phone.txt and returns its result.
sub phone_process($) {
    my ($text) = @_;
    return common_process("$Bin/antimat-data/phone.txt", $text);
}
# Demo: run each sample through the transliteration, censor and phone filters,
# in that order, and print the result.
my @samples = (
    "abcde mail.ru def",
    "abcde ya.ru def",
    "Опа-опа, срослась pizda и жопа, ya.ru этого не может быть, между ними должен быть хххууйййййй. P.S. Позвони мне по номеру +79261111111 и лайкни меня на mail.ru.",
);
say phone_process(censor_process(transliterate_process($_))) for @samples;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment