Skip to content

Instantly share code, notes, and snippets.

@windytan
Last active October 24, 2017 10:26
Show Gist options
  • Save windytan/f009275a0188cab4d69534b5bcd62519 to your computer and use it in GitHub Desktop.
Save windytan/f009275a0188cab4d69534b5bcd62519 to your computer and use it in GitHub Desktop.
Markov-ketjuilla uusia sanoja suomenkielisestä lähdetekstistä
#!/usr/bin/perl
#
# Markov-tavutin
#
# (c) Oona Räisänen
# MIT license
#
use strict;
use warnings;
use utf8;
use English;
use v5.10;
use open ':encoding(UTF-8)';
binmode(STDOUT, ":utf8");
my %dict;
my %is_original;
my %is_beginning;
my @beginnings;
main();
sub print_usage {
say "Usage: perl $0 FILE [NUM]";
}
sub main {
if (@ARGV == 0) {
print_usage();
exit;
}
my $filename = shift @ARGV;
my $num_chains = ($ARGV[0] // 10);
make_dict($filename);
for (1 .. $num_chains) {
say make_chain();
}
return;
}
sub make_dict {
my $filename = shift;
open my $fh, '<', $filename or die $filename . ": " . $OS_ERROR;
my @lines = <$fh>;
close $fh;
for (@lines) {
chomp;
$_ = lc($_);
s/[^a-zäöåÄÖÅ ]/ /g;
my @words = split / +/;
for (@words) {
$is_original{$_} = 1;
my @tavut = split /-/, tavuta(lc($_));
for my $first (0 .. $#tavut-2) {
my $dict_first = $tavut[$first] . q{ } . $tavut[$first + 1];
my $dict_second = $tavut[$first + 2];
push @{$dict{$dict_first}}, $dict_second;
if ($first == 0) {
$is_beginning{$dict_first} ++;
}
}
}
}
@beginnings = keys %is_beginning;
return;
}
sub make_chain {
my $joined = q{};
while (1) {
my @chain = split / /, $beginnings[rand @beginnings];
while (1) {
my $pair = $chain[-2] . q{ } . $chain[-1];
last if (not exists $dict{$pair});
my @alts = @{$dict{$pair}};
push @chain, $alts[rand @alts];
}
$joined = join q{}, @chain;
last if (not exists $is_original{$joined});
}
$joined =~ s/ ([\.\,\!\?\:])/$1/g;
return $joined;
}
sub tavuta {
my $tulos = shift;
my $VO = "[aeiouyäöå]";
my $KO = "[bcdfghjklmnpqrstvwxz]";
# Konsonanttisääntö
$tulos =~ s/($VO$KO*)($KO$VO)/$1-$2/ig;
my @tavut = split /-/, $tulos;
s/($VO$KO*)($KO$VO)/$1-$2/ig for (@tavut);
$tulos = join "-", @tavut;
# Vokaalisääntö
@tavut = split /-/, $tulos;
for (@tavut) {
if (/^$KO*($VO)($VO)/i) {
my ($a, $b) = ($1, $2);
if ($a ne $b && $b ne "i" &&
$a.$b !~ /(?:ie|uo|yö|[aeio]u|[eiäö]y)/) {
s/($VO)($VO)/$1-$2/i;
}
}
}
$tulos = join "-", @tavut;
# Diftongisääntö
@tavut = split /-/, $tulos;
s/($VO{2})($VO)/$1-$2/i for (@tavut);
$tulos = join "-", @tavut;
return $tulos;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment