Last active
June 20, 2017 13:58
-
-
Save fkohlgrueber/f3cbf135d1e971b2fce4607ddbafbbd7 to your computer and use it in GitHub Desktop.
KogSys Exercise 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Übungsblatt 3" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Aufgabe 1\n", | |
"\n", | |
"### Onlinefrage Nr. 1: Welche Eigenschaften gelten für einen Autoencoder?\n", | |
" 1. **unüberwacht**\n", | |
" 2. überwacht\n", | |
" 3. **nicht-parametrisch**\n", | |
" 4. parametrisch" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Aufgabe 2\n", | |
"\n", | |
"### a)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
from collections import defaultdict

# Hidden states and observation alphabet of the HMM used in Aufgabe 2.
S = ['Start', 'Mitte', 'Ende']
V = ['A', 'B', 'C']

# Transition probabilities A[source][target]. The nested defaultdict makes any
# transition that is not listed here read as 0 (note that such a read also
# inserts the 0 entry into the table).
A = defaultdict(lambda: defaultdict(int))
A['Start'].update({'Start': 0.8, 'Mitte': 0.2})
A['Mitte'].update({'Mitte': 0.4, 'Ende': 0.6})
A['Ende'].update({'Ende': 1.0})

# Emission probabilities B[state][symbol].
B = {
    'Start': {'A': 0.5, 'B': 0.2, 'C': 0.3},
    'Mitte': {'A': 0.1, 'B': 0.1, 'C': 0.8},
    'Ende': {'A': 0.3, 'B': 0.3, 'C': 0.4},
}

# Initial state distribution: all mass on 'Start'.
PI = {'Start': 1.0, 'Mitte': 0.0, 'Ende': 0.0}

# Observation sequences to evaluate.
inputs = ['ABC', 'CBB', 'CCC']
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ABC: 0.003552\n", | |
"CBB: 0.010656\n", | |
"CCC: 0.036864\n" | |
] | |
} | |
], | |
"source": [ | |
def forward_algorithm(s):
    """Return the forward probability of emitting ``s`` and finishing in 'Ende'.

    Starting from the initial distribution ``PI``, every symbol of ``s`` first
    triggers a transition (``A``) and is then emitted by the target state
    (``B``). Only the probability mass that sits in state 'Ende' after the last
    symbol is returned, rounded to 10 decimal places. For an empty ``s`` this
    is simply ``PI['Ende']``.
    """
    alpha = PI
    for symbol in s:
        following = defaultdict(int)
        for source in S:
            for target, p_transition in A[source].items():
                following[target] += alpha[source] * p_transition * B[target][symbol]
        alpha = following
    return round(alpha['Ende'], 10)


# Print the forward probability of every observation sequence.
for s in inputs:
    print(f"{s}: {forward_algorithm(s)}")
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### b)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Q = ['Start', 'Start', 'Mitte', 'Ende']\n" | |
] | |
} | |
], | |
"source": [ | |
def calc_max(func, S):
    """Return ``(k, func(k))`` for the element ``k`` of ``S`` that maximises ``func``.

    ``func`` is evaluated once per element, in iteration order. If several
    elements share the maximal value, the one that comes last in ``S`` wins.
    Raises ``IndexError`` when ``S`` is empty.
    """
    scored = [(candidate, func(candidate)) for candidate in S]
    best = scored[0]
    for entry in scored[1:]:
        # ">=" rather than ">" so that a later tie replaces the earlier one.
        if entry[1] >= best[1]:
            best = entry
    return best
"\n", | |
def viterbi(s):
    """Print the most likely state path for observation string ``s`` ending in 'Ende'.

    The initial state is drawn from ``PI`` and emits ``s[0]``; every further
    symbol is emitted after one transition. Back-tracking always starts from
    the state 'Ende'. The result is printed as ``Q = [...]``; nothing is
    returned.

    Transition probabilities are read with ``.get`` so that probing a
    transition that is not listed in ``A`` yields 0 without inserting a new
    entry into the shared (nested defaultdict) table.

    Raises ``IndexError`` if ``s`` is empty.

    NOTE(review): the back-pointers of step 0 are a fixed 'Start' placeholder
    and the back-tracking loop walks down to step 0, so the printed path has
    ``len(s) + 1`` entries whose first element is that placeholder -- confirm
    this is the intended presentation.
    NOTE(review): ``forward_algorithm`` applies a transition *before* the
    first emission, whereas here the initial state emits ``s[0]`` directly --
    confirm which convention the exercise expects.
    """
    def transition(source, target):
        # Read-only lookup: unlike A[source][target] this never mutates A.
        return A.get(source, {}).get(target, 0)

    scores = [{state: PI[state] * B[state][s[0]] for state in S}]
    back_pointers = [{state: 'Start' for state in S}]
    for symbol in s[1:]:
        previous = scores[-1]
        step_scores = {}
        step_pointers = {}
        for state in S:
            best_prev, best_score = calc_max(
                lambda k: transition(k, state) * previous[k], S)
            step_scores[state] = B[state][symbol] * best_score
            step_pointers[state] = best_prev
        scores.append(step_scores)
        back_pointers.append(step_pointers)

    # Follow the back-pointers from the final state to the first step.
    path = ['Ende']
    for step in range(len(scores) - 1, -1, -1):
        path.append(back_pointers[step][path[-1]])
    print('Q = ' + str(list(reversed(path))))


viterbi('ABC')
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### c)\n", | |
"\n", | |
"#### Onlinefrage Nr. 2: Welche Zeichenkette aus Teilaufgabe a) ist wahrscheinlicher?\n", | |
"1. $O_1$\n", | |
"2. $O_2$\n", | |
"3. **$O_3$** <--\n", | |
"4. $O_1$ und $O_2$\n", | |
"5. $O_2$ und $O_3$" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Aufgabe 3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
# Trigram language model as a CSV table: the first two columns are the history
# (w_{i-2}, w_{i-1}), the remaining columns hold P(w_i | history) for the word
# named in the header.
model = """w_{i-2},w_{i-1},wait,and,Tea,drink,</S>
 n/a,<S>,0.5,0.25,0.1,0.1,0.05
 <S>,wait,0.04,0.6,0.05,0.05,0.26
 <S>,and,0.4,0.05,0.4,0.05,0.1
 <S>,Tea,0.04,0.6,0.05,0.05,0.26
 <S>,drink,0.04,0.5,0.25,0.05,0.16
 wait,wait,0.2,0.2,0.25,0.2,0.15
 wait,and,0.04,0.1,0.5,0.3,0.06
 wait,Tea,0.1,0.5,0.2,0.1,0.1
 wait,drink,0.04,0.35,0.4,0.1,0.11
 and,wait,0.04,0.05,0.3,0.25,0.36
 and,and,0.2,0.2,0.25,0.2,0.15
 and,Tea,0.1,0.1,0.1,0.5,0.2
 and,drink,0.04,0.05,0.6,0.1,0.21
 Tea,wait,0.1,0.6,0.1,0.1,0.1
 Tea,and,0.2,0.1,0.1,0.25,0.35
 Tea,Tea,0.1,0.1,0.1,0.1,0.6
 Tea,drink,0.2,0.2,0.2,0.25,0.15
 drink,wait,0.1,0.3,0.1,0.1,0.4
 drink,and,0.4,0.05,0.05,0.05,0.45
 drink,Tea,0.1,0.3,0.1,0.1,0.4
 drink,drink,0.2,0.2,0.25,0.2,0.15
 """

# Split every line into stripped, comma-separated cells; the first line is the
# header. The whitespace-only last line becomes a single empty cell and
# therefore contributes no probabilities.
_rows = [raw_line.strip().split(',') for raw_line in model.split('\n')]
header = _rows[0]

# trigrams[(w_{i-2}, w_{i-1}, w_i)] = P(w_i | w_{i-2}, w_{i-1})
trigrams = {
    (cells[0], cells[1], header[column]): float(cells[column])
    for cells in _rows[1:]
    for column in range(2, len(cells))
}
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### a) - d)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"a: 6.201736729460421\n", | |
"b: 4.6415888336127775\n", | |
"c: 5.8088430419725885\n", | |
"d: 5.865803381559204\n" | |
] | |
} | |
], | |
"source": [ | |
"from math import log2\n", | |
def PPL(W):
    """Return the perplexity of sentence ``W`` under the trigram model ``trigrams``.

    ``W`` is split on arbitrary whitespace, so repeated, leading or trailing
    spaces do not produce empty tokens; an empty sentence is scored as the
    single trigram ending in ``</S>``. The token list is padded with the
    history ``n/a, <S>`` and terminated with ``</S>``, which gives
    ``N = number_of_words + 1`` trigrams. The result is
    ``2 ** (-(1/N) * sum(log2 P(trigram)))``.

    Raises ``KeyError`` for a trigram that is not in the table and
    ``ValueError`` if a trigram has probability 0 (``log2(0)``).
    """
    tokens = ["n/a", "<S>"] + W.split() + ["</S>"]
    n_trigrams = len(tokens) - 2
    log_prob = 0
    for start in range(n_trigrams):
        log_prob += log2(trigrams[tuple(tokens[start:start + 3])])
    # Same operand order as before so the printed values stay bit-identical.
    return 2 ** (-1 / n_trigrams * log_prob)
"\n", | |
# (label, sentence) pairs for sub-tasks a)-d).
# Deliberately not named `inputs`: that name already holds the HMM observation
# strings of Aufgabe 2, and rebinding it to tuples here would make the
# forward-algorithm cell fail when it is re-run afterwards.
sentences = [('a', 'Tea'),
             ('b', 'drink Tea'),
             ('c', 'Tea wait and drink'),
             ('d', 'Tea drink and wait')]
for label, sentence in sentences:
    print(f"{label}: {PPL(sentence)}")
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### e)\n", | |
"\n", | |
"#### Onlinefrage Nr. 3: Geben Sie die Sätze a)-d) nach aufsteigender Perplexität sortiert an.\n", | |
"1. abcd\n", | |
"2. **bcda**\n", | |
"3. acbd\n", | |
"4. bcad\n", | |
"5. cdab" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Aufgabe 4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
# Reference sentence that every hypothesis is compared against.
ref = 'if there is no rain in April you will have a great summer'

# Hypotheses 1-5 (list index + 1 is the hypothesis number used in the output).
h = [
    'no rain in april then great summer come',
    'there is rain in April you have summer',
    'in April no rain you have summer great',
    'there is no rain in apple a great summer comes',
    'you have a great summer comes if there is no rain in April',
]
"\n", | |
def levenshtein(a, b, sub_penalty=1):
    """Return the edit distance between the sequences ``a`` and ``b``.

    Works on any indexable sequences (strings for character level, lists of
    words for word level). Dropping or adding an element costs 1; replacing an
    element by a different one costs ``sub_penalty``. Only two rows of the
    dynamic-programming table are kept at a time.
    """
    row = list(range(len(b) + 1))
    for row_no, item_a in enumerate(a, start=1):
        above, row = row, [row_no]
        for col, item_b in enumerate(b, start=1):
            from_above = above[col] + 1
            from_left = row[col - 1] + 1
            diagonal = above[col - 1] + (item_a != item_b) * sub_penalty
            row.append(min(from_above, from_left, diagonal))
    return row[-1]
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### a)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 9\n", | |
"2 5\n", | |
"3 8\n", | |
"4 6\n", | |
"5 12\n" | |
] | |
} | |
], | |
"source": [ | |
# a) Word-level edit distance of every hypothesis (numbered from 1) to the reference.
for number, hypothesis in enumerate(h, start=1):
    print(number, levenshtein(hypothesis.split(' '), ref.split(' ')))
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### b)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 31\n", | |
"2 19\n", | |
"3 33\n", | |
"4 25\n", | |
"5 46\n" | |
] | |
} | |
], | |
"source": [ | |
# b) Character-level edit distance of every hypothesis (numbered from 1) to the reference.
for number, hypothesis in enumerate(h, start=1):
    print(number, levenshtein(hypothesis, ref))
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### c)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 11\n", | |
"2 5\n", | |
"3 11\n", | |
"4 7\n", | |
"5 12\n" | |
] | |
} | |
], | |
"source": [ | |
# c) Word-level edit distance with substitutions costing 2 instead of 1.
for number, hypothesis in enumerate(h, start=1):
    print(number, levenshtein(hypothesis.split(' '), ref.split(' '), sub_penalty=2))
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Onlinefrage Nr. 4: ... Welche Hypothese ist nun der Referenz am ähnlichsten (gemessen in Editierdistanz auf Wortebene), und was ist die entsprechende Editierdistanz?\n", | |
"1. 4, 6\n", | |
"2. **2, 5**\n", | |
"3. 3, 8\n", | |
"4. 1, 6\n", | |
"5. 2, 6\n", | |
"6. 3, 5" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.0" | |
}, | |
"toc": { | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"running_highlight": "#FF0000", | |
"selected_highlight": "#FFD700" | |
}, | |
"moveMenuLeft": true, | |
"nav_menu": { | |
"height": "344px", | |
"width": "252px" | |
}, | |
"navigate_menu": true, | |
"number_sections": true, | |
"sideBar": true, | |
"threshold": 4, | |
"toc_cell": false, | |
"toc_section_display": "block", | |
"toc_window_display": false, | |
"widenNotebook": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment