kokes · September 1, 2017 10:19
diff --git a/steno-nlp.ipynb b/steno-nlp.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [],
   "source": [
    "from czech_stemmer import cz_stem\n",
    "from glob import glob\n",
    "import json\n",
    "from collections import Counter, defaultdict\n",
    "import re\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fns = glob('json/*.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "73190"
      ]
     },
     "execution_count": 235,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dt = []\n",
    "# glob order is filesystem-dependent; sort so the pre-shuffle order is stable\n",
    "for fn in sorted(fns):\n",
    "    # JSON is UTF-8 by spec; do not rely on the platform default encoding\n",
    "    with open(fn, encoding='utf-8') as f:\n",
    "        dt.extend(json.load(f))\n",
    "\n",
    "# Shuffle so the positional train/test split below is not time-dependent.\n",
    "# A fixed seed keeps the split reproducible across kernel restarts.\n",
    "random.Random(42).shuffle(dt)\n",
    "len(dt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stop list that stemmed tokens are checked against.\n",
    "# (TF-IDF handles this on its own, doesn't it?)\n",
    "stop = set((\n",
    "    'za, my, si, co, to, na, je, se, že, kter, tak, pan, byl, já, jak, '\n",
    "    'do, bud, ted, vás, vám, pro, bod, tad, ve, měl, dan, jso, jsm, jsem, takh, tam, tom, '\n",
    "    'aby, když, ano, ne, by, ale, mi, být, ta, tét, toh, už, ten, nen'\n",
    ").split(', '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vyr = defaultdict(list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Per author, collect the stemmed tokens of the first 10000 records,\n",
    "# dropping one-character stems and stop words.\n",
    "for rec in dt[:10000]:\n",
    "    words = re.findall(r'[^\\W\\d]+', rec['text'].lower())\n",
    "    stems = [s for s in map(cz_stem, words) if len(s) > 1 and s not in stop]\n",
    "    vyr[rec['autor']].extend(stems)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counter(vyr['Miroslava Němcová']).most_common()[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('vlád', 108),\n",
       " ('zákon', 105),\n",
       " ('návrh', 101),\n",
       " ('občan', 94),\n",
       " ('vážen', 88),\n",
       " ('protoh', 74),\n",
       " ('neb', 74),\n",
       " ('česk', 67),\n",
       " ('bych', 63),\n",
       " ('úsvit', 56)]"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(vyr['Tomio Okamura']).most_common()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TF IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def cisti(text):\n",
    "    \"\"\"Lower-case `text`, stem every run of non-digit word characters, join with spaces.\"\"\"\n",
    "    words = re.findall(r'[^\\W\\d]+', text.lower())\n",
    "    return ' '.join(map(cz_stem, words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parallel lists over all records with a known author:\n",
    "# the author name and the cleaned text of the record.\n",
    "known = [rec for rec in dt if rec['autor'] is not None]\n",
    "\n",
    "aut_master = [rec['autor'] for rec in known]\n",
    "proj_master = [cisti(rec['text']) for rec in known]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'František Laudát',\n",
       " 'Jan Bartošek',\n",
       " 'Jan Hamáček',\n",
       " 'Jaroslava Jermanová',\n",
       " 'Miroslav Kalousek',\n",
       " 'Petr Gazdík',\n",
       " 'Radek Vondráček',\n",
       " 'Vojtěch Filip',\n",
       " 'Václav Votava',\n",
       " 'Zbyněk Stanjura'}"
      ]
     },
     "execution_count": 339,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The ten most frequent authors; only these are kept as classes below.\n",
    "tp = {author for author, _count in Counter(aut_master).most_common(10)}\n",
    "tp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 358,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28743"
      ]
     },
     "execution_count": 358,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "aut = []\n",
    "proj = []\n",
    "\n",
    "# Keep a sample only if its author is in the top set and the cleaned text\n",
    "# has at least 100 characters (TODO: too short?).\n",
    "for who, speech in zip(aut_master, proj_master):\n",
    "    if who in tp and len(speech) >= 100:\n",
    "        aut.append(who)\n",
    "        proj.append(speech)\n",
    "\n",
    "len(aut)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25868"
      ]
     },
     "execution_count": 359,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nt = int(0.9*len(aut))\n",
    "nt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.9 s, sys: 80.6 ms, total: 2.98 s\n",
      "Wall time: 2.98 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "model.fit(proj[:nt], aut[:nt])\n",
    "pred = model.predict(proj[nt:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "mapujem 10 lidi, ale jen 7 hadame?!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 7)"
      ]
     },
     "execution_count": 361,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# distinct authors present in the test split vs. distinct authors ever predicted\n",
    "len(set(aut[nt:])), len(set(pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "metadata": {},
   "outputs": [],
   "source": [
    "true_cn = Counter(aut[nt:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "55.791304347826085\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'Jan Bartošek': 0.5492957746478861,\n",
       "             'Jan Hamáček': 0.11666666666666657,\n",
       "             'Jaroslava Jermanová': 0.07630522088353413,\n",
       "             'Miroslav Kalousek': 0.19594594594594583,\n",
       "             'Petr Gazdík': 0.6999999999999948,\n",
       "             'Vojtěch Filip': 0.9777777777777746,\n",
       "             'Zbyněk Stanjura': 0.8550724637681184})"
      ]
     },
     "execution_count": 363,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uhod = []  # correctly predicted authors, one entry per hit\n",
    "txt = []   # cleaned test texts of those hits\n",
    "\n",
    "# Slice the test split once instead of re-slicing it on every iteration.\n",
    "for guess, truth, speech in zip(pred, aut[nt:], proj[nt:]):\n",
    "    if guess == truth:\n",
    "        uhod.append(guess)\n",
    "        txt.append(speech)\n",
    "\n",
    "# Per-author share of test samples guessed correctly: count the hits and divide\n",
    "# once, rather than summing 1/n repeatedly (which accumulates float error).\n",
    "uh = defaultdict(int, {who: hits / true_cn[who] for who, hits in Counter(uhod).items()})\n",
    "\n",
    "# overall accuracy on the test split, in percent\n",
    "print(100*len(uhod) / len(pred))\n",
    "uh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 232,
	"metadata": {},
	"outputs": [],
	"source": [
	"from czech_stemmer import cz_stem\n",
	"from glob import glob\n",
	"import json\n",
	"from collections import Counter, defaultdict\n",
	"import re\n",
	"import random"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 233,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"fns = glob('json/*.json')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 235,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"73190"
	]
	},
	"execution_count": 235,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dt = []\n",
	"# glob order is filesystem-dependent; sort so the pre-shuffle order is stable\n",
	"for fn in sorted(fns):\n",
	"    # JSON is UTF-8 by spec; do not rely on the platform default encoding\n",
	"    with open(fn, encoding='utf-8') as f:\n",
	"        dt.extend(json.load(f))\n",
	"\n",
	"# Shuffle so the positional train/test split below is not time-dependent.\n",
	"# A fixed seed keeps the split reproducible across kernel restarts.\n",
	"random.Random(42).shuffle(dt)\n",
	"len(dt)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 96,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Stop list that stemmed tokens are checked against.\n",
	"# (TF-IDF handles this on its own, doesn't it?)\n",
	"stop = set((\n",
	"    'za, my, si, co, to, na, je, se, že, kter, tak, pan, byl, já, jak, '\n",
	"    'do, bud, ted, vás, vám, pro, bod, tad, ve, měl, dan, jso, jsm, jsem, takh, tam, tom, '\n",
	"    'aby, když, ano, ne, by, ale, mi, být, ta, tét, toh, už, ten, nen'\n",
	").split(', '))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 97,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"vyr = defaultdict(list)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 98,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Per author, collect the stemmed tokens of the first 10000 records,\n",
	"# dropping one-character stems and stop words.\n",
	"for rec in dt[:10000]:\n",
	"    words = re.findall(r'[^\\W\\d]+', rec['text'].lower())\n",
	"    stems = [s for s in map(cz_stem, words) if len(s) > 1 and s not in stop]\n",
	"    vyr[rec['autor']].extend(stems)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 99,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Counter(vyr['Miroslava Němcová']).most_common()[:20]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 101,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('vlád', 108),\n",
	" ('zákon', 105),\n",
	" ('návrh', 101),\n",
	" ('občan', 94),\n",
	" ('vážen', 88),\n",
	" ('protoh', 74),\n",
	" ('neb', 74),\n",
	" ('česk', 67),\n",
	" ('bych', 63),\n",
	" ('úsvit', 56)]"
	]
	},
	"execution_count": 101,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Counter(vyr['Tomio Okamura']).most_common()[:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## TF IDF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 236,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.feature_extraction.text import TfidfVectorizer\n",
	"from sklearn.naive_bayes import MultinomialNB\n",
	"from sklearn.pipeline import make_pipeline\n",
	"\n",
	"model = make_pipeline(TfidfVectorizer(), MultinomialNB())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 237,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def cisti(text):\n",
	"    \"\"\"Lower-case `text`, stem every run of non-digit word characters, join with spaces.\"\"\"\n",
	"    words = re.findall(r'[^\\W\\d]+', text.lower())\n",
	"    return ' '.join(map(cz_stem, words))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 294,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Parallel lists over all records with a known author:\n",
	"# the author name and the cleaned text of the record.\n",
	"known = [rec for rec in dt if rec['autor'] is not None]\n",
	"\n",
	"aut_master = [rec['autor'] for rec in known]\n",
	"proj_master = [cisti(rec['text']) for rec in known]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 339,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'František Laudát',\n",
	" 'Jan Bartošek',\n",
	" 'Jan Hamáček',\n",
	" 'Jaroslava Jermanová',\n",
	" 'Miroslav Kalousek',\n",
	" 'Petr Gazdík',\n",
	" 'Radek Vondráček',\n",
	" 'Vojtěch Filip',\n",
	" 'Václav Votava',\n",
	" 'Zbyněk Stanjura'}"
	]
	},
	"execution_count": 339,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# The ten most frequent authors; only these are kept as classes below.\n",
	"tp = {author for author, _count in Counter(aut_master).most_common(10)}\n",
	"tp"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 358,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"28743"
	]
	},
	"execution_count": 358,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"aut = []\n",
	"proj = []\n",
	"\n",
	"# Keep a sample only if its author is in the top set and the cleaned text\n",
	"# has at least 100 characters (TODO: too short?).\n",
	"for who, speech in zip(aut_master, proj_master):\n",
	"    if who in tp and len(speech) >= 100:\n",
	"        aut.append(who)\n",
	"        proj.append(speech)\n",
	"\n",
	"len(aut)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 359,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"25868"
	]
	},
	"execution_count": 359,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"nt = int(0.9*len(aut))\n",
	"nt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 360,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2.9 s, sys: 80.6 ms, total: 2.98 s\n",
	"Wall time: 2.98 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"model.fit(proj[:nt], aut[:nt])\n",
	"pred = model.predict(proj[nt:])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"mapujem 10 lidi, ale jen 7 hadame?!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 361,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(10, 7)"
	]
	},
	"execution_count": 361,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# distinct authors present in the test split vs. distinct authors ever predicted\n",
	"len(set(aut[nt:])), len(set(pred))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 362,
	"metadata": {},
	"outputs": [],
	"source": [
	"true_cn = Counter(aut[nt:])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 363,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"55.791304347826085\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"defaultdict(int,\n",
	" {'Jan Bartošek': 0.5492957746478861,\n",
	" 'Jan Hamáček': 0.11666666666666657,\n",
	" 'Jaroslava Jermanová': 0.07630522088353413,\n",
	" 'Miroslav Kalousek': 0.19594594594594583,\n",
	" 'Petr Gazdík': 0.6999999999999948,\n",
	" 'Vojtěch Filip': 0.9777777777777746,\n",
	" 'Zbyněk Stanjura': 0.8550724637681184})"
	]
	},
	"execution_count": 363,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"uhod = []  # correctly predicted authors, one entry per hit\n",
	"txt = []   # cleaned test texts of those hits\n",
	"\n",
	"# Slice the test split once instead of re-slicing it on every iteration.\n",
	"for guess, truth, speech in zip(pred, aut[nt:], proj[nt:]):\n",
	"    if guess == truth:\n",
	"        uhod.append(guess)\n",
	"        txt.append(speech)\n",
	"\n",
	"# Per-author share of test samples guessed correctly: count the hits and divide\n",
	"# once, rather than summing 1/n repeatedly (which accumulates float error).\n",
	"uh = defaultdict(int, {who: hits / true_cn[who] for who, hits in Counter(uhod).items()})\n",
	"\n",
	"# overall accuracy on the test split, in percent\n",
	"print(100*len(uhod) / len(pred))\n",
	"uh"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
No results found