cstorey · January 14, 2018 15:29
diff --git a/Metaphone Markov.ipynb b/Metaphone Markov.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import metaphone as mp\n",
    "import collections as cs\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('A', u'A'),\n",
       " ('be', u'P'),\n",
       " ('l', u'L'),\n",
       " ('mo', u'M'),\n",
       " ('schu', u'X'),\n",
       " ('s', u'S')]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def split_to_phones(word):\n",
    "    word_mp = mp.doublemetaphone(word)[0]\n",
    "    prev_w_idx = 0\n",
    "    prev_mp_idx =0\n",
    "    for x in xrange(1, len(word)+1):\n",
    "        w0, w1 = (word[:x], word[x:])\n",
    "        m0, m1 = (mp.doublemetaphone(w0)[0], mp.doublemetaphone(w1)[0])\n",
    "        if (m0 + m1) == word_mp:\n",
    "            word_slice = w0[prev_w_idx:]\n",
    "            mp_slice = m0[prev_mp_idx:]\n",
    "            if mp_slice and word_slice:\n",
    "                yield (word_slice, mp_slice)\n",
    "            prev_w_idx = len(w0)\n",
    "            prev_mp_idx = len(m0)\n",
    "list(split_to_phones('Abelmoschus'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "words = [w for w in file('/usr/share/dict/words').read().split('\\n') if w.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "157011"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.html.widgets import FloatProgress\n",
    "from IPython.display import display\n",
    "#display(f)\n",
    "#for i in xrange(100):\n",
    "#   sleep(0.1)\n",
    "#   f.value = i\n",
    "\n",
    "    \n",
    "window_size = 4\n",
    "mp_markov = cs.defaultdict(lambda: cs.defaultdict(int))\n",
    "\n",
    "total = len(words)\n",
    "f = FloatProgress(min=0, max=total)\n",
    "display(f)\n",
    "\n",
    "for n, w in enumerate(words):\n",
    "    if (n%100) == 0:\n",
    "        #print \n",
    "        f.value = n\n",
    "        f.description = \"{}/{} ({:4f}%)\".format(n, total, 100*float(n)/total)\n",
    "    \n",
    "    ws = []\n",
    "    for (w, m) in split_to_phones(w):\n",
    "        ws.append(w.lower())\n",
    "    ws.append(None)\n",
    "        \n",
    "    for idx in xrange(len(ws)-1):\n",
    "        preceeding = tuple(ws[max(0, idx-window_size+1):idx])\n",
    "        succ = ws[idx]\n",
    "        mp_markov[preceeding][succ] += 1\n",
    "\n",
    "f.value = total; f.description = \"Done\"\n",
    "len(mp_markov)\n",
    "# {m: {w: cnt for w, cnt in cnts.iteritems()} for m, cnts in mp_markov.iteritems()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(('ly', 'phy', 'llou'), defaultdict(<type 'int'>, {'s': 2})),\n",
       " (('a', 'thi'), defaultdict(<type 'int'>, {'r': 1, 'n': 1})),\n",
       " (('s', 'ki', 'r'),\n",
       "  defaultdict(<type 'int'>, {'ni': 1, 'ty': 1, 'mi': 6, 'l': 2, 'li': 1, 'p': 1, 't': 9, 'ti': 2, 'te': 4})),\n",
       " (('u', 'nou', 'two'), defaultdict(<type 'int'>, {'r': 1})),\n",
       " (('s', 'pi', 'nge'), defaultdict(<type 'int'>, {'l': 1})),\n",
       " (('vi', 'ce', 'n'), defaultdict(<type 'int'>, {'t': 3})),\n",
       " (('s', 'mo', 'si'), defaultdict(<type 'int'>, {'s': 16, 'te': 1, 'c': 1})),\n",
       " (('zzi', 'ni', 's'), defaultdict(<type 'int'>, {'t': 1})),\n",
       " (('ga', 'rrya'), defaultdict(<type 'int'>, {'ceae': 1})),\n",
       " (('a', 'm', 'phio'), defaultdict(<type 'int'>, {'ni': 1, 'n': 1}))]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mp_markov.items()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def markovify():\n",
    "    state = ()\n",
    "    while True:\n",
    "        #print state, state[-(window_size-1):]\n",
    "\n",
    "        options = mp_markov.get(state, {})\n",
    "        if not options:\n",
    "            return\n",
    "        nxt = random.choice([o for o, cnt in options.iteritems() for _ in xrange(cnt)])\n",
    "        #print state, nxt#, options\n",
    "        yield nxt\n",
    "        state = state[-(window_size-2):] + (nxt,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hippelatestringefulness\n",
      "savingness\n",
      "vary\n",
      "pholidota\n",
      "fiuman\n",
      "doundake\n",
      "bearwood\n",
      "fourthly\n",
      "overreligionistraddlebusternoscapethriftlessness\n",
      "adtevac\n",
      "fishhooks\n",
      "sion\n",
      "carmaniac\n",
      "certy\n",
      "frothiness\n",
      "hantlessly\n",
      "mizzly\n",
      "mealless\n",
      "futuristfulness\n",
      "dulcigenously\n",
      "snottiness\n",
      "casuistrych\n",
      "fastidiousness\n",
      "overassertiversatility\n",
      "viceversalgia\n",
      "phrasistlessly\n",
      "fourieristikon\n",
      "anthropicringmanshipmentaneously\n",
      "ammeline\n",
      "apiose\n"
     ]
    }
   ],
   "source": [
    "for _ in xrange(30):\n",
    "    print \"\".join(markovify())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import metaphone as mp\n",
	"import collections as cs\n",
	"import random"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('A', u'A'),\n",
	" ('be', u'P'),\n",
	" ('l', u'L'),\n",
	" ('mo', u'M'),\n",
	" ('schu', u'X'),\n",
	" ('s', u'S')]"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def split_to_phones(word):\n",
	" word_mp = mp.doublemetaphone(word)[0]\n",
	" prev_w_idx = 0\n",
	" prev_mp_idx =0\n",
	" for x in xrange(1, len(word)+1):\n",
	" w0, w1 = (word[:x], word[x:])\n",
	" m0, m1 = (mp.doublemetaphone(w0)[0], mp.doublemetaphone(w1)[0])\n",
	" if (m0 + m1) == word_mp:\n",
	" word_slice = w0[prev_w_idx:]\n",
	" mp_slice = m0[prev_mp_idx:]\n",
	" if mp_slice and word_slice:\n",
	" yield (word_slice, mp_slice)\n",
	" prev_w_idx = len(w0)\n",
	" prev_mp_idx = len(m0)\n",
	"list(split_to_phones('Abelmoschus'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"words = [w for w in file('/usr/share/dict/words').read().split('\\n') if w.strip()]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"157011"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from IPython.html.widgets import FloatProgress\n",
	"from IPython.display import display\n",
	"#display(f)\n",
	"#for i in xrange(100):\n",
	"# sleep(0.1)\n",
	"# f.value = i\n",
	"\n",
	" \n",
	"window_size = 4\n",
	"mp_markov = cs.defaultdict(lambda: cs.defaultdict(int))\n",
	"\n",
	"total = len(words)\n",
	"f = FloatProgress(min=0, max=total)\n",
	"display(f)\n",
	"\n",
	"for n, w in enumerate(words):\n",
	" if (n%100) == 0:\n",
	" #print \n",
	" f.value = n\n",
	" f.description = \"{}/{} ({:4f}%)\".format(n, total, 100*float(n)/total)\n",
	" \n",
	" ws = []\n",
	" for (w, m) in split_to_phones(w):\n",
	" ws.append(w.lower())\n",
	" ws.append(None)\n",
	" \n",
	" for idx in xrange(len(ws)-1):\n",
	" preceeding = tuple(ws[max(0, idx-window_size+1):idx])\n",
	" succ = ws[idx]\n",
	" mp_markov[preceeding][succ] += 1\n",
	"\n",
	"f.value = total; f.description = \"Done\"\n",
	"len(mp_markov)\n",
	"# {m: {w: cnt for w, cnt in cnts.iteritems()} for m, cnts in mp_markov.iteritems()}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(('ly', 'phy', 'llou'), defaultdict(<type 'int'>, {'s': 2})),\n",
	" (('a', 'thi'), defaultdict(<type 'int'>, {'r': 1, 'n': 1})),\n",
	" (('s', 'ki', 'r'),\n",
	" defaultdict(<type 'int'>, {'ni': 1, 'ty': 1, 'mi': 6, 'l': 2, 'li': 1, 'p': 1, 't': 9, 'ti': 2, 'te': 4})),\n",
	" (('u', 'nou', 'two'), defaultdict(<type 'int'>, {'r': 1})),\n",
	" (('s', 'pi', 'nge'), defaultdict(<type 'int'>, {'l': 1})),\n",
	" (('vi', 'ce', 'n'), defaultdict(<type 'int'>, {'t': 3})),\n",
	" (('s', 'mo', 'si'), defaultdict(<type 'int'>, {'s': 16, 'te': 1, 'c': 1})),\n",
	" (('zzi', 'ni', 's'), defaultdict(<type 'int'>, {'t': 1})),\n",
	" (('ga', 'rrya'), defaultdict(<type 'int'>, {'ceae': 1})),\n",
	" (('a', 'm', 'phio'), defaultdict(<type 'int'>, {'ni': 1, 'n': 1}))]"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"mp_markov.items()[:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 76,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def markovify():\n",
	" state = ()\n",
	" while True:\n",
	" #print state, state[-(window_size-1):]\n",
	"\n",
	" options = mp_markov.get(state, {})\n",
	" if not options:\n",
	" return\n",
	" nxt = random.choice([o for o, cnt in options.iteritems() for _ in xrange(cnt)])\n",
	" #print state, nxt#, options\n",
	" yield nxt\n",
	" state = state[-(window_size-2):] + (nxt,)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 84,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"hippelatestringefulness\n",
	"savingness\n",
	"vary\n",
	"pholidota\n",
	"fiuman\n",
	"doundake\n",
	"bearwood\n",
	"fourthly\n",
	"overreligionistraddlebusternoscapethriftlessness\n",
	"adtevac\n",
	"fishhooks\n",
	"sion\n",
	"carmaniac\n",
	"certy\n",
	"frothiness\n",
	"hantlessly\n",
	"mizzly\n",
	"mealless\n",
	"futuristfulness\n",
	"dulcigenously\n",
	"snottiness\n",
	"casuistrych\n",
	"fastidiousness\n",
	"overassertiversatility\n",
	"viceversalgia\n",
	"phrasistlessly\n",
	"fourieristikon\n",
	"anthropicringmanshipmentaneously\n",
	"ammeline\n",
	"apiose\n"
	]
	}
	],
	"source": [
	"for _ in xrange(30):\n",
	" print \"\".join(markovify())"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}