Last active
May 11, 2017 12:33
-
-
Save Measter/af7255757a98cc7eaf8741efcef92b8b to your computer and use it in GitHub Desktop.
Markov Chain Word Generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
namespace WordGenerator | |
{ | |
public class MarkovWordGenerator | |
{ | |
const int MAX_TRIES = 200; | |
private readonly Dictionary<string, Chain> m_chains = new Dictionary<string, Chain>(); | |
private readonly Dictionary<int, int> m_sampleLengths = new Dictionary<int, int>(); | |
private readonly List<string> m_used = new List<string>(); | |
private readonly List<char> m_separators = new List<char>(); | |
private readonly Dictionary<Tuple<int, int>, bool> m_filled = new Dictionary<Tuple<int, int>, bool>(); | |
private Random m_rand; | |
private int m_totalSampleSize; | |
private int m_order; | |
private char m_nullChar; | |
public MarkovWordGenerator( int order ) | |
{ | |
m_order = order < 1 ? 3 : order; | |
m_nullChar = (char)0; | |
m_totalSampleSize = 0; | |
} | |
public MarkovWordGenerator( BinaryReader br ) | |
{ | |
ReadRawData( br ); | |
} | |
public void SampleWords( IEnumerable<string> samples ) | |
{ | |
foreach( string s in samples ) | |
SampleWord( s ); | |
} | |
public void SampleWord( string s ) | |
{ | |
string nulledWord, token; | |
Chain entry; | |
AddSampleLength( s.Length ); | |
nulledWord = String.Concat( new string( m_nullChar, m_order ), s.ToUpper(), m_nullChar ); | |
for( int letter = 0; letter < nulledWord.Length - m_order; letter++ ) | |
{ | |
token = nulledWord.Substring( letter, m_order ); | |
if( m_chains.ContainsKey( token ) ) | |
entry = m_chains[token]; | |
else | |
{ | |
entry = new Chain( this ); | |
m_chains[token] = entry; | |
} | |
entry.AddCharacter( nulledWord[letter + m_order] ); | |
} | |
} | |
public void ResetUsedList() | |
{ | |
m_used.Clear(); | |
} | |
public string NextName( int minLength, int maxLength, bool repeat = false ) | |
{ | |
if( minLength < 1 ) | |
minLength = 1; | |
minLength += m_order; | |
maxLength += m_order; | |
Tuple<int, int> sizeTuple = new Tuple<int, int>( minLength, maxLength ); | |
if( !m_filled.ContainsKey( sizeTuple ) ) | |
m_filled[sizeTuple] = false; | |
string word; | |
if( !m_filled[sizeTuple] ) | |
{ | |
string token; | |
int tries = 0; | |
do | |
{ | |
if( maxLength == -1 + m_order ) | |
maxLength = GetSampledSize(); | |
word = new string( m_nullChar, m_order ); | |
while( word.Length < maxLength ) | |
{ | |
token = word.Substring( word.Length - m_order, m_order ); | |
char c = m_chains[token].GetLetter(); | |
if( c != m_nullChar ) | |
word += c; | |
else | |
break; | |
} | |
tries++; | |
if( tries >= MAX_TRIES ) | |
{ | |
word = null; | |
break; | |
} | |
if( word.Length >= minLength && !m_used.Contains( word ) ) | |
break; | |
} while( true ); | |
m_filled[sizeTuple] = tries == MAX_TRIES; | |
if( word != null && !m_used.Contains( word ) ) | |
m_used.Add( word ); | |
} else if( repeat ) | |
{ | |
word = m_used.Where( s => s.Length >= minLength && s.Length < maxLength ).RandomItem( m_rand ); | |
} else | |
{ | |
return null; | |
} | |
return word != null ? ToTitleCase( word ) : null; | |
} | |
public void SetRandom( Random rand ) | |
{ | |
m_rand = rand; | |
} | |
public void AddSeparators( params char[] seps ) | |
{ | |
foreach( char c in seps ) | |
if( !m_separators.Contains( c ) ) | |
m_separators.Add( c ); | |
} | |
private string ToTitleCase( string word ) | |
{ | |
char[] parts = word.ToCharArray( m_order, word.Length - m_order ); | |
for( int i = 1; i < parts.Length; i++ ) | |
{ | |
if( m_separators.Contains( parts[i - 1] ) ) | |
continue; | |
parts[i] = Char.ToLower( parts[i] ); | |
} | |
return new string( parts ); | |
} | |
private int GetSampledSize() | |
{ | |
int index = m_rand.Next( m_totalSampleSize ); | |
int num = 0; | |
foreach( var pair in m_sampleLengths ) | |
{ | |
if( pair.Value > index ) | |
{ | |
num = pair.Key; | |
break; | |
} | |
index -= pair.Value; | |
} | |
return num; | |
} | |
private void AddSampleLength( int len ) | |
{ | |
if( !m_sampleLengths.ContainsKey( len ) ) | |
m_sampleLengths.Add( len, 0 ); | |
m_sampleLengths[len]++; | |
m_totalSampleSize++; | |
} | |
public void DumpRawData( BinaryWriter bw ) | |
{ | |
bw.Write( m_order ); | |
bw.Write( m_nullChar ); | |
bw.Write( m_totalSampleSize ); | |
bw.Write( m_sampleLengths.Count ); | |
foreach( var pair in m_sampleLengths ) | |
{ | |
bw.Write( pair.Key ); | |
bw.Write( pair.Value ); | |
} | |
bw.Write( m_chains.Count ); | |
foreach( var chain in m_chains ) | |
{ | |
bw.Write( chain.Key ); | |
chain.Value.DumpData( bw ); | |
} | |
} | |
private void ReadRawData( BinaryReader br ) | |
{ | |
m_order = br.ReadInt32(); | |
m_nullChar = br.ReadChar(); | |
m_totalSampleSize = br.ReadInt32(); | |
int size = br.ReadInt32(); | |
for( int i = 0; i < size; i++ ) | |
m_sampleLengths.Add( br.ReadInt32(), br.ReadInt32() ); | |
size = br.ReadInt32(); | |
for( int i = 0; i < size; i++ ) | |
{ | |
m_chains.Add( br.ReadString(), Chain.ReadRawData( br, this ) ); | |
} | |
} | |
internal class Chain | |
{ | |
private readonly Dictionary<char, int> m_letters; | |
private int m_totalSize; | |
private readonly MarkovWordGenerator m_parentGen; | |
public Chain( MarkovWordGenerator gen ) | |
{ | |
m_letters = new Dictionary<char, int>(); | |
m_totalSize = 0; | |
m_parentGen = gen; | |
} | |
public void AddCharacter( char letter ) | |
{ | |
if( !m_letters.ContainsKey( letter ) ) | |
m_letters.Add( letter, 0 ); | |
m_letters[letter]++; | |
m_totalSize++; | |
} | |
public char GetLetter() | |
{ | |
int index = m_parentGen.m_rand.Next( m_totalSize ); | |
char letter = (char)0; | |
foreach( var pair in m_letters ) | |
{ | |
if( pair.Value > index ) | |
{ | |
letter = pair.Key; | |
break; | |
} | |
index -= pair.Value; | |
} | |
return letter; | |
} | |
public void DumpData( BinaryWriter bw ) | |
{ | |
bw.Write( m_totalSize ); | |
bw.Write( m_letters.Count ); | |
foreach( var pair in m_letters ) | |
{ | |
bw.Write( pair.Key ); | |
bw.Write( pair.Value ); | |
} | |
} | |
public static Chain ReadRawData( BinaryReader br, MarkovWordGenerator gen ) | |
{ | |
Chain ch = new Chain( gen ); | |
ch.m_totalSize = br.ReadInt32(); | |
int size = br.ReadInt32(); | |
for( int i = 0; i < size; i++ ) | |
ch.m_letters.Add( br.ReadChar(), br.ReadInt32() ); | |
return ch; | |
} | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
namespace WordGenerator | |
{ | |
public static class Extensions | |
{ | |
public static T RandomItem<T>( this IEnumerable<T> list, Random rand ) | |
{ | |
if( !list.Any() ) | |
return default( T ); | |
var enumerable = list as List<T> ?? list.ToList(); | |
return enumerable.ToList()[rand.Next( enumerable.Count() )]; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment