Skip to content

Instantly share code, notes, and snippets.

@kokes
Created September 1, 2017 10:19
Show Gist options
  • Select an option

  • Save kokes/ac372156be99c08d7914ca12e0e86c8f to your computer and use it in GitHub Desktop.

Select an option

Save kokes/ac372156be99c08d7914ca12e0e86c8f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 232,
"metadata": {},
"outputs": [],
"source": [
"from czech_stemmer import cz_stem\n",
"from glob import glob\n",
"import json\n",
"from collections import Counter, defaultdict\n",
"import re\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Sort the matches: glob() returns paths in arbitrary, OS-dependent order.\n",
"fns = sorted(glob('json/*.json'))"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"73190"
]
},
"execution_count": 235,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read every dump file and pool its records into one list.\n",
"dt = []\n",
"for fn in sorted(fns):  # fixed file order; glob() gives no ordering guarantee\n",
"    # JSON is UTF-8 by convention; do not depend on the platform's default codec.\n",
"    with open(fn, encoding='utf-8') as f:\n",
"        dt.extend(json.load(f))\n",
"\n",
"# Shuffle so that the record order is not time-dependent; the fixed seed keeps\n",
"# the later train/test split reproducible across kernel restarts.\n",
"random.Random(42).shuffle(dt)\n",
"len(dt)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Doesn't TF-IDF already take care of this on its own?\n",
"# Tokens that are dropped before the per-author word counts are built.\n",
"stop = {\n",
"    'za', 'my', 'si', 'co', 'to', 'na', 'je', 'se', 'že', 'kter', 'tak', 'pan',\n",
"    'byl', 'já', 'jak', 'do', 'bud', 'ted', 'vás', 'vám', 'pro', 'bod', 'tad',\n",
"    've', 'měl', 'dan', 'jso', 'jsm', 'jsem', 'takh', 'tam', 'tom', 'aby',\n",
"    'když', 'ano', 'ne', 'by', 'ale', 'mi', 'být', 'ta', 'tét', 'toh', 'už',\n",
"    'ten', 'nen',\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Author name -> flat list of the stemmed tokens from that author's texts.\n",
"vyr = defaultdict(list)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Collect the stems of the first 10000 (already shuffled) records per author.\n",
"slovo_re = re.compile(r'[^\\W\\d]+')  # runs of word characters that are not digits\n",
"for el in dt[:10000]:\n",
"    # Skip records without a speaker, as the TF-IDF preparation further down\n",
"    # does; otherwise they all end up pooled under the key None.\n",
"    if el['autor'] is None:\n",
"        continue\n",
"    stems = (cz_stem(j) for j in slovo_re.findall(el['text'].lower()))\n",
"    # Drop one-letter stems and stop words.\n",
"    expr = [j for j in stems if len(j) > 1 and j not in stop]\n",
"    vyr[el['autor']].extend(expr)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"# Counter(vyr['Miroslava Němcová']).most_common()[:20]"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('vlád', 108),\n",
" ('zákon', 105),\n",
" ('návrh', 101),\n",
" ('občan', 94),\n",
" ('vážen', 88),\n",
" ('protoh', 74),\n",
" ('neb', 74),\n",
" ('česk', 67),\n",
" ('bych', 63),\n",
" ('úsvit', 56)]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten most frequent stems collected for Tomio Okamura.\n",
"Counter(vyr['Tomio Okamura']).most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF IDF"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline\n",
"\n",
"# TF-IDF features feeding a multinomial naive Bayes classifier.\n",
"model = make_pipeline(\n",
"    TfidfVectorizer(),\n",
"    MultinomialNB(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 237,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cisti(text):\n",
"    \"\"\"Normalise a text into a space-separated string of stems.\n",
"\n",
"    The text is lower-cased, split into runs of word characters that are not\n",
"    digits, and every token is passed through ``cz_stem``.\n",
"    \"\"\"\n",
"    tokens = re.findall(r'[^\\W\\d]+', text.lower())\n",
"    return ' '.join(map(cz_stem, tokens))"
]
},
{
"cell_type": "code",
"execution_count": 294,
"metadata": {},
"outputs": [],
"source": [
"# Keep only records with a known author and build two parallel lists:\n",
"# the author labels and the cleaned (stemmed) texts.\n",
"known = [el for el in dt if el['autor'] is not None]\n",
"\n",
"aut_master = [el['autor'] for el in known]\n",
"proj_master = [cisti(el['text']) for el in known]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 339,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'František Laudát',\n",
" 'Jan Bartošek',\n",
" 'Jan Hamáček',\n",
" 'Jaroslava Jermanová',\n",
" 'Miroslav Kalousek',\n",
" 'Petr Gazdík',\n",
" 'Radek Vondráček',\n",
" 'Vojtěch Filip',\n",
" 'Václav Votava',\n",
" 'Zbyněk Stanjura'}"
]
},
"execution_count": 339,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Names of the ten authors with the most records.\n",
"tp = {name for name, count in Counter(aut_master).most_common(10)}\n",
"tp"
]
},
{
"cell_type": "code",
"execution_count": 358,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"28743"
]
},
"execution_count": 358,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Restrict the data to the selected authors and drop texts shorter than\n",
"# 100 characters (measured on the cleaned text).\n",
"kept = [\n",
"    (autor, text)\n",
"    for autor, text in zip(aut_master, proj_master)\n",
"    if autor in tp and len(text) >= 100  # TODO: too short?\n",
"]\n",
"\n",
"aut = [autor for autor, text in kept]\n",
"proj = [text for autor, text in kept]\n",
"\n",
"len(aut)"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"25868"
]
},
"execution_count": 359,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Index that separates the training part (first 90 %) from the held-out rest.\n",
"train_share = 0.9\n",
"nt = int(train_share * len(aut))\n",
"nt"
]
},
{
"cell_type": "code",
"execution_count": 360,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.9 s, sys: 80.6 ms, total: 2.98 s\n",
"Wall time: 2.98 s\n"
]
}
],
"source": [
"%%time\n",
"# Fit on the first nt samples, then predict authors for the held-out tail.\n",
"train_texts, train_authors = proj[:nt], aut[:nt]\n",
"model.fit(train_texts, train_authors)\n",
"pred = model.predict(proj[nt:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mapujeme 10 lidí, ale hádáme jen 7?!"
]
},
{
"cell_type": "code",
"execution_count": 361,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10, 7)"
]
},
"execution_count": 361,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct true authors in the held-out part vs. distinct authors the model predicted.\n",
"len(set(aut[nt:])), len(set(pred))"
]
},
{
"cell_type": "code",
"execution_count": 362,
"metadata": {},
"outputs": [],
"source": [
"# Number of held-out samples per true author.\n",
"true_cn = Counter(aut[nt:])"
]
},
{
"cell_type": "code",
"execution_count": 363,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"55.791304347826085\n"
]
},
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'Jan Bartošek': 0.5492957746478861,\n",
" 'Jan Hamáček': 0.11666666666666657,\n",
" 'Jaroslava Jermanová': 0.07630522088353413,\n",
" 'Miroslav Kalousek': 0.19594594594594583,\n",
" 'Petr Gazdík': 0.6999999999999948,\n",
" 'Vojtěch Filip': 0.9777777777777746,\n",
" 'Zbyněk Stanjura': 0.8550724637681184})"
]
},
"execution_count": 363,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Overall accuracy (printed) and per-author hit rate on the held-out samples.\n",
"test_aut, test_proj = aut[nt:], proj[nt:]  # slice once instead of once per iteration\n",
"\n",
"uhod = []  # author of every correctly classified sample\n",
"txt = []   # the matching texts, aligned with uhod\n",
"for a, b, text in zip(pred, test_aut, test_proj):\n",
"    if a == b:\n",
"        uhod.append(a)\n",
"        txt.append(text)\n",
"\n",
"# Count the hits and divide once per author; summing 1/n for every hit\n",
"# accumulates rounding error (0.6999999999999948 instead of 0.7).\n",
"uh = defaultdict(float)\n",
"for a, hits in Counter(uhod).items():\n",
"    uh[a] = hits / true_cn[a]\n",
"\n",
"print(100*len(uhod) / len(pred))\n",
"uh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment