Created
September 1, 2017 10:19
-
-
Save kokes/ac372156be99c08d7914ca12e0e86c8f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 232, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from czech_stemmer import cz_stem\n", | |
| "from glob import glob\n", | |
| "import json\n", | |
| "from collections import Counter, defaultdict\n", | |
| "import re\n", | |
| "import random" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 233, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "fns = glob('json/*.json')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 235, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "73190" | |
| ] | |
| }, | |
| "execution_count": 235, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dt = []\n", | |
| "for fn in fns:\n", | |
| " with open(fn) as f:\n", | |
| " dt.extend(json.load(f))\n", | |
| "\n", | |
| "random.shuffle(dt) # at to neni casove zavisle\n", | |
| "len(dt)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 96, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# TF-IDF already takes care of this on its own, doesn't it?\n", | |
| "stop = set('za, my, si, co, to, na, je, se, že, kter, tak, pan, byl, já, jak, \\\n", | |
| "do, bud, ted, vás, vám, pro, bod, tad, ve, měl, dan, jso, jsm, jsem, takh, tam, tom, \\\n", | |
| "aby, když, ano, ne, by, ale, mi, být, ta, tét, toh, už, ten, nen'.split(', '))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 97, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "vyr = defaultdict(list)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 98, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "for el in dt[:10000]:\n", | |
| " expr = [cz_stem(j) for j in re.findall(r'[^\\W\\d]+', el['text'].lower())]\n", | |
| " expr = [j for j in expr if len(j) > 1 and j not in stop]\n", | |
| " vyr[el['autor']].extend(expr)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 99, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Counter(vyr['Miroslava Němcová']).most_common()[:20]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 101, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('vlád', 108),\n", | |
| " ('zákon', 105),\n", | |
| " ('návrh', 101),\n", | |
| " ('občan', 94),\n", | |
| " ('vážen', 88),\n", | |
| " ('protoh', 74),\n", | |
| " ('neb', 74),\n", | |
| " ('česk', 67),\n", | |
| " ('bych', 63),\n", | |
| " ('úsvit', 56)]" | |
| ] | |
| }, | |
| "execution_count": 101, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "Counter(vyr['Tomio Okamura']).most_common()[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## TF IDF" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 236, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
| "from sklearn.naive_bayes import MultinomialNB\n", | |
| "from sklearn.pipeline import make_pipeline\n", | |
| "\n", | |
| "model = make_pipeline(TfidfVectorizer(), MultinomialNB())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 237, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def cisti(text):\n", | |
| " return ' '.join([cz_stem(j) for j in re.findall(r'[^\\W\\d]+', text.lower())])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 294, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "proj_master = []\n", | |
| "aut_master = []\n", | |
| "\n", | |
| "for el in dt:\n", | |
| " if el['autor'] is None: continue\n", | |
| " \n", | |
| " aut_master.append(el['autor'])\n", | |
| " proj_master.append(cisti(el['text']))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 339, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'František Laudát',\n", | |
| " 'Jan Bartošek',\n", | |
| " 'Jan Hamáček',\n", | |
| " 'Jaroslava Jermanová',\n", | |
| " 'Miroslav Kalousek',\n", | |
| " 'Petr Gazdík',\n", | |
| " 'Radek Vondráček',\n", | |
| " 'Vojtěch Filip',\n", | |
| " 'Václav Votava',\n", | |
| " 'Zbyněk Stanjura'}" | |
| ] | |
| }, | |
| "execution_count": 339, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tp = set([j[0] for j in Counter(aut_master).most_common()[:10]])\n", | |
| "tp" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 358, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "28743" | |
| ] | |
| }, | |
| "execution_count": 358, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "aut = []\n", | |
| "proj = []\n", | |
| "\n", | |
| "for j in range(len(aut_master)):\n", | |
| " if aut_master[j] not in tp: continue\n", | |
| " if len(proj_master[j]) < 100: continue # TODO: too short?\n", | |
| " \n", | |
| " aut.append(aut_master[j])\n", | |
| " proj.append(proj_master[j])\n", | |
| " \n", | |
| "len(aut)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 359, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "25868" | |
| ] | |
| }, | |
| "execution_count": 359, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "nt = int(0.9*len(aut))\n", | |
| "nt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 360, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 2.9 s, sys: 80.6 ms, total: 2.98 s\n", | |
| "Wall time: 2.98 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "model.fit(proj[:nt], aut[:nt])\n", | |
| "pred = model.predict(proj[nt:])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "mapujeme 10 lidí, ale hádáme jen 7?!" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 361, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(10, 7)" | |
| ] | |
| }, | |
| "execution_count": 361, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(list(set(aut[nt:]))), len(list(set(pred)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 362, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "true_cn = Counter(aut[nt:])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 363, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "55.791304347826085\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int,\n", | |
| " {'Jan Bartošek': 0.5492957746478861,\n", | |
| " 'Jan Hamáček': 0.11666666666666657,\n", | |
| " 'Jaroslava Jermanová': 0.07630522088353413,\n", | |
| " 'Miroslav Kalousek': 0.19594594594594583,\n", | |
| " 'Petr Gazdík': 0.6999999999999948,\n", | |
| " 'Vojtěch Filip': 0.9777777777777746,\n", | |
| " 'Zbyněk Stanjura': 0.8550724637681184})" | |
| ] | |
| }, | |
| "execution_count": 363, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "uhod = []\n", | |
| "uh = defaultdict(int)\n", | |
| "txt = []\n", | |
| "for j, pr in enumerate(pred):\n", | |
| " a, b = pr, aut[nt:][j]\n", | |
| " if a == b:\n", | |
| " uh[a] += 1/true_cn[a]\n", | |
| " uhod.append(a)\n", | |
| " txt.append(proj[nt:][j])\n", | |
| "\n", | |
| "print(100*len(uhod) / len(pred))\n", | |
| "uh" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment