Skip to content

Instantly share code, notes, and snippets.

@abcarroll
Created July 4, 2015 10:42
Show Gist options
  • Save abcarroll/e1927f3f3871c9816306 to your computer and use it in GitHub Desktop.
Save abcarroll/e1927f3f3871c9816306 to your computer and use it in GitHub Desktop.
Make a hopefully pronounceable word based on vowel/consonant patterns. Patterns are pregenerated by analyzing dictionary files.
<?php
// A.B. Carroll, <[email protected]>
// "Do whatever the fuck you want license"
// Generates a 'statistics chunk' for use in pronounceable word generation
// It's a one-time generation, so it doesn't need to be fast.
// Newline-separated word list to analyse.
$dictionaryPath = '/usr/share/dict/american-english';
$file = file($dictionaryPath);
if ($file === false) {
    // file() only raises a warning on failure; stop here rather than
    // silently producing an empty statistics chunk.
    fwrite(STDERR, "Unable to read dictionary file: $dictionaryPath\n");
    exit(1);
}

// Initial pass: count how often each vowel/consonant pattern occurs,
// grouped by word length: $patterns[length][pattern] => occurrences.
$patterns = [];
$vowels = str_split('aeiou');
foreach ($file as $word) {
    $word = trim($word);
    // Only plain ASCII words; skips entries with apostrophes, accents, etc.
    if (!preg_match('/^[A-Za-z]+$/', $word)) {
        continue;
    }
    //echo "Regular word: $word\n";
    $pattern = '';
    // Lower-case first so a capitalised vowel ("Aaron") is still a vowel.
    foreach (str_split(strtolower($word)) as $letter) {
        $pattern .= in_array($letter, $vowels, true) ? 'v' : 'c';
    }
    $length = strlen($pattern);
    if (!isset($patterns[$length][$pattern])) {
        $patterns[$length][$pattern] = 0;
    }
    $patterns[$length][$pattern]++;
}

// Prune, sort, re-key: $sortedPatterns[length][percentage] => pattern
$sortedPatterns = [];
ksort($patterns, SORT_NUMERIC);
foreach ($patterns as $length => $iPatterns) {
    // Only word lengths up to 10 are exported.
    if ($length > 10) {
        continue;
    }
    // Most frequent pattern first.
    arsort($iPatterns, SORT_NUMERIC);
    $total = array_sum($iPatterns);
    foreach ($iPatterns as $pattern => $count) {
        $percentage = (int) round(($count / $total) * 100);
        if ($percentage <= 2) {
            // Counts are descending, so every remaining pattern is too rare.
            break;
        }
        // echo "$length => $pattern => $count ($percentage%)\n";
        // The percentage is the key, so two patterns that round to the same
        // percentage cannot both be stored; keep the more frequent one
        // (seen first) instead of letting a rarer one overwrite it.
        if (!isset($sortedPatterns[$length][$percentage])) {
            $sortedPatterns[$length][$percentage] = $pattern;
        }
    }
}
echo "Your statistics chunk:\n\n";
echo json_encode($sortedPatterns);
<?php
// A.B. Carroll, <[email protected]>
// "Do whatever the fuck you want license"
// Generates a pronounceable word based on pregenerated vowel/consonant
// statistical data
// Sshh.. No words.. Only code now...
/**
 * Generate a random, hopefully pronounceable, lowercase word.
 *
 * A vowel/consonant pattern of the requested length (e.g. "cvcvc") is drawn
 * at random, weighted by the percentage keys of the embedded statistics
 * chunk; every 'v' slot is then filled with a random vowel and every 'c'
 * slot with a random consonant.
 *
 * @param int $length Desired word length; the embedded data covers 1 to 10.
 *
 * @return string The generated word, $length letters long.
 *
 * @throws \RuntimeException If the pattern data has no entry for $length.
 */
function pronounceableWord($length) {
    // Statistics chunk, decoded once: length => [percentage => pattern].
    static $patternsByLength = null;
    if ($patternsByLength === null) {
        $patternsByLength = json_decode(
            '{"1":{"90":"c","10":"v"},"2":{"51":"cc","35":"cv","15"' .
            ':"vc"},"3":{"64":"cvc","9":"cvv","7":"ccv","4":"vcv"},"4":{"39":"' .
            'cvcc","20":"cvcv","14":"ccvc","3":"vcvc"},"5":{"24":"cvcvc","17":' .
            '"cvccc","13":"ccvcc","9":"cvvcc","8":"cvccv","5":"ccvvc","3":"cvv' .
            'cv"},"6":{"27":"cvccvc","12":"cvcvcc","7":"ccvcvc","6":"cvvcvc","' .
            '5":"cvcvvc","3":"ccvvcc"},"7":{"20":"cvccvcc","9":"ccvccvc","8":"' .
            'cvcvcvc","7":"cvccvvc","4":"ccvcvcc","3":"ccvvcvc"},"8":{"9":"cvc' .
            'cvcvc","8":"ccvccvcc","7":"cvcvcvcc","6":"cvcccvcc","5":"cvcvccvc' .
            '","4":"cvccvccc","3":"cvcvcvcv"},"9":{"9":"cvccvcvcc","6":"cvccvc' .
            'cvc","5":"cvcvccvcc","4":"cvcvcvcvc","3":"ccvcccvcc"},"10":{"6":"' .
            'cvccvccvcc","4":"cvcvcvcvcc","3":"cvcvccvcvc"}}',
            true
        );
    }

    if (!isset($patternsByLength[$length])) {
        throw new \RuntimeException("Length of '$length' isn't supported by " .
            "the pattern data provided.");
    }

    // The array keys are the weights (percentages) of each pattern.
    $weightedPatterns = $patternsByLength[$length];
    $totalWeight = array_sum(array_keys($weightedPatterns));

    // mt_rand() includes both bounds, so roll in [0, total - 1]: a roll equal
    // to the total would fall past the last bucket and select no pattern.
    $roll = mt_rand(0, $totalWeight - 1); // the magic deciding number

    $selectedPattern = '';
    $cumulativeWeight = 0;
    foreach ($weightedPatterns as $weight => $pattern) {
        $cumulativeWeight += $weight;
        if ($roll < $cumulativeWeight) {
            $selectedPattern = $pattern;
            break;
        }
    }

    $alphabets = [
        'v' => str_split('aeiou'),
        'c' => str_split('bcdfghjklmnpqrstvwxyz'),
    ];

    $generatedWord = '';
    foreach (str_split($selectedPattern) as $kind) {
        $choices = $alphabets[$kind];
        $generatedWord .= $choices[array_rand($choices)];
    }

    return $generatedWord;
}
// Demo: print one generated five-letter word followed by a blank line.
echo "And the winner is... ", pronounceableWord(5), "\n\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment