Skip to content

Instantly share code, notes, and snippets.

@Measter
Last active May 11, 2017 12:33
Show Gist options
  • Save Measter/af7255757a98cc7eaf8741efcef92b8b to your computer and use it in GitHub Desktop.
Save Measter/af7255757a98cc7eaf8741efcef92b8b to your computer and use it in GitHub Desktop.
Markov Chain Word Generator
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace WordGenerator
{
public class MarkovWordGenerator
{
const int MAX_TRIES = 200;
private readonly Dictionary<string, Chain> m_chains = new Dictionary<string, Chain>();
private readonly Dictionary<int, int> m_sampleLengths = new Dictionary<int, int>();
private readonly List<string> m_used = new List<string>();
private readonly List<char> m_separators = new List<char>();
private readonly Dictionary<Tuple<int, int>, bool> m_filled = new Dictionary<Tuple<int, int>, bool>();
private Random m_rand;
private int m_totalSampleSize;
private int m_order;
private char m_nullChar;
public MarkovWordGenerator( int order )
{
m_order = order < 1 ? 3 : order;
m_nullChar = (char)0;
m_totalSampleSize = 0;
}
public MarkovWordGenerator( BinaryReader br )
{
ReadRawData( br );
}
public void SampleWords( IEnumerable<string> samples )
{
foreach( string s in samples )
SampleWord( s );
}
public void SampleWord( string s )
{
string nulledWord, token;
Chain entry;
AddSampleLength( s.Length );
nulledWord = String.Concat( new string( m_nullChar, m_order ), s.ToUpper(), m_nullChar );
for( int letter = 0; letter < nulledWord.Length - m_order; letter++ )
{
token = nulledWord.Substring( letter, m_order );
if( m_chains.ContainsKey( token ) )
entry = m_chains[token];
else
{
entry = new Chain( this );
m_chains[token] = entry;
}
entry.AddCharacter( nulledWord[letter + m_order] );
}
}
public void ResetUsedList()
{
m_used.Clear();
}
public string NextName( int minLength, int maxLength, bool repeat = false )
{
if( minLength < 1 )
minLength = 1;
minLength += m_order;
maxLength += m_order;
Tuple<int, int> sizeTuple = new Tuple<int, int>( minLength, maxLength );
if( !m_filled.ContainsKey( sizeTuple ) )
m_filled[sizeTuple] = false;
string word;
if( !m_filled[sizeTuple] )
{
string token;
int tries = 0;
do
{
if( maxLength == -1 + m_order )
maxLength = GetSampledSize();
word = new string( m_nullChar, m_order );
while( word.Length < maxLength )
{
token = word.Substring( word.Length - m_order, m_order );
char c = m_chains[token].GetLetter();
if( c != m_nullChar )
word += c;
else
break;
}
tries++;
if( tries >= MAX_TRIES )
{
word = null;
break;
}
if( word.Length >= minLength && !m_used.Contains( word ) )
break;
} while( true );
m_filled[sizeTuple] = tries == MAX_TRIES;
if( word != null && !m_used.Contains( word ) )
m_used.Add( word );
} else if( repeat )
{
word = m_used.Where( s => s.Length >= minLength && s.Length < maxLength ).RandomItem( m_rand );
} else
{
return null;
}
return word != null ? ToTitleCase( word ) : null;
}
public void SetRandom( Random rand )
{
m_rand = rand;
}
public void AddSeparators( params char[] seps )
{
foreach( char c in seps )
if( !m_separators.Contains( c ) )
m_separators.Add( c );
}
private string ToTitleCase( string word )
{
char[] parts = word.ToCharArray( m_order, word.Length - m_order );
for( int i = 1; i < parts.Length; i++ )
{
if( m_separators.Contains( parts[i - 1] ) )
continue;
parts[i] = Char.ToLower( parts[i] );
}
return new string( parts );
}
private int GetSampledSize()
{
int index = m_rand.Next( m_totalSampleSize );
int num = 0;
foreach( var pair in m_sampleLengths )
{
if( pair.Value > index )
{
num = pair.Key;
break;
}
index -= pair.Value;
}
return num;
}
private void AddSampleLength( int len )
{
if( !m_sampleLengths.ContainsKey( len ) )
m_sampleLengths.Add( len, 0 );
m_sampleLengths[len]++;
m_totalSampleSize++;
}
public void DumpRawData( BinaryWriter bw )
{
bw.Write( m_order );
bw.Write( m_nullChar );
bw.Write( m_totalSampleSize );
bw.Write( m_sampleLengths.Count );
foreach( var pair in m_sampleLengths )
{
bw.Write( pair.Key );
bw.Write( pair.Value );
}
bw.Write( m_chains.Count );
foreach( var chain in m_chains )
{
bw.Write( chain.Key );
chain.Value.DumpData( bw );
}
}
private void ReadRawData( BinaryReader br )
{
m_order = br.ReadInt32();
m_nullChar = br.ReadChar();
m_totalSampleSize = br.ReadInt32();
int size = br.ReadInt32();
for( int i = 0; i < size; i++ )
m_sampleLengths.Add( br.ReadInt32(), br.ReadInt32() );
size = br.ReadInt32();
for( int i = 0; i < size; i++ )
{
m_chains.Add( br.ReadString(), Chain.ReadRawData( br, this ) );
}
}
internal class Chain
{
private readonly Dictionary<char, int> m_letters;
private int m_totalSize;
private readonly MarkovWordGenerator m_parentGen;
public Chain( MarkovWordGenerator gen )
{
m_letters = new Dictionary<char, int>();
m_totalSize = 0;
m_parentGen = gen;
}
public void AddCharacter( char letter )
{
if( !m_letters.ContainsKey( letter ) )
m_letters.Add( letter, 0 );
m_letters[letter]++;
m_totalSize++;
}
public char GetLetter()
{
int index = m_parentGen.m_rand.Next( m_totalSize );
char letter = (char)0;
foreach( var pair in m_letters )
{
if( pair.Value > index )
{
letter = pair.Key;
break;
}
index -= pair.Value;
}
return letter;
}
public void DumpData( BinaryWriter bw )
{
bw.Write( m_totalSize );
bw.Write( m_letters.Count );
foreach( var pair in m_letters )
{
bw.Write( pair.Key );
bw.Write( pair.Value );
}
}
public static Chain ReadRawData( BinaryReader br, MarkovWordGenerator gen )
{
Chain ch = new Chain( gen );
ch.m_totalSize = br.ReadInt32();
int size = br.ReadInt32();
for( int i = 0; i < size; i++ )
ch.m_letters.Add( br.ReadChar(), br.ReadInt32() );
return ch;
}
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
namespace WordGenerator
{
public static class Extensions
{
public static T RandomItem<T>( this IEnumerable<T> list, Random rand )
{
if( !list.Any() )
return default( T );
var enumerable = list as List<T> ?? list.ToList();
return enumerable.ToList()[rand.Next( enumerable.Count() )];
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment