Last active
December 25, 2015 01:09
-
-
Save haje01/6892583 to your computer and use it in GitHub Desktop.
집단지성프로그래밍 3장 for 파이썬 코리아 강남 스터디
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# \uad70\uc9d1\ubc1c\uacac\n", | |
| "\n", | |
| "2\uc7a5\uc758 \uc544\uc774\ub514\uc5b4\ub97c \uc774\uc6a9\ud574\uc11c \ub300\uaddc\ubaa8 \ub370\uc774\ud130 \uc138\ud2b8\uc5d0\uc11c \uc720\uc0ac\ud56d\ubaa9\uc744 \uac00\uc9c4 \uadf8\ub8f9\uc744 \uc790\ub3d9\uc73c\ub85c \ucd94\ucd9c\ud558\ub294 \uae30\ubc95\ub4e4\n", | |
| "\n", | |
| "**\ub370\uc774\ud130\uad70\uc9d1(data-clustering)**\uc740 \ubc00\uc811\ud788 \uad00\ub828\ub41c \uc0ac\ub78c, \uc0ac\ubb3c, \uc544\uc774\ub514\uc5b4\ub4e4\uc758 \uadf8\ub8f9\uc744 \ucc3e\uace0 \uc2dc\uac01\ud654\ud558\ub294 \uae30\ubc95\uc774\ub2e4.\n", | |
| "\n", | |
| "\n", | |
| "\uace0\uac1d\uc774 \uad6c\ub9e4\ud55c \uc0c1\ud488\uc744 \uae30\ub85d\ud574 \uc77c\ubc18\uc801\uc778 \uc778\uad6c \ud1b5\uacc4\uc801 \uc815\ubcf4\uc640 \uc774 \uad6c\ub9e4 \uc815\ubcf4\ub97c \ud1b5\ud574 \uc720\uc0ac\ud55c \uad6c\ub9e4 \ud615\ud0dc\ub97c \uac00\uc9c4 \uadf8\ub8f9\ub4e4\uc744 \ucd94\ucd9c \uac00\ub2a5\n", | |
| "\n", | |
| "\uc0ac\ub840:\n", | |
| "\n", | |
| " * \ud328\uc158 \uc12c(fashion island)\n", | |
| " * \uc720\uc804\uc790 \uadf8\ub8f9 \ubc1c\uad74\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 1. \uac10\ub3c5 \ub300 \ubb34\uac10\ub3c5 \ud559\uc2b5\n", | |
| "\n", | |
| "\uac10\ub3c5\ud559\uc2b5\uae30\ubc95 (Supervised learning method) - \uc608\uc81c \uc785\ucd9c\ub825\uc744 \uc0ac\uc6a9\ud574\uc11c \uc608\uce21\ud558\ub294 \ubc29\ubc95\uc744 \ud559\uc2b5\ud558\ub294 \uae30\ubc95, \uc2e0\uacbd\ub9dd, \uacb0\uc815\ud2b8\ub9ac \ub4f1\n", | |
| "\n", | |
| "\uad70\uc9d1\uc740 \ubb34\uac10\ub3c5 \ud559\uc2b5(unsupervised learning)\uc758 \ud55c \uc608\ub2e4. \uc62c\ubc14\ub978 \ub2f5\uc744 \ucc3e\ub294 \uac83\uc774 \uc544\ub2c8\ub77c \ub370\uc774\ud130 \uc9d1\ud569 \ub0b4\uc5d0\uc11c \uad6c\uc870\ub97c \ucc3e\ub294 \uac83 " | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 2. \ub2e8\uc5b4 \ubca1\ud130\n", | |
| "\n", | |
| "### \ube14\ub85c\uac70 \ubd84\ub958\n", | |
| "\n", | |
| "\uc0c1\uc704 \ube14\ub85c\uac70 120\uba85\uc758 \ube14\ub85c\uadf8\uc5d0\uc11c \ub4f1\uc7a5\ud55c \ud2b9\uc815 \ub2e8\uc5b4\ub4e4\uc758 \ud69f\uc218\n", | |
| "\n", | |
| "\ube44\uc2b7\ud55c \uc2a4\ud0c0\uc77c\uc744 \uac00\uc9c0\uac70\ub098 \uc720\uc0ac \uc8fc\uc81c\uc5d0 \ub300\ud55c \uae00\uc774 \uc790\uc8fc \uc62c\ub77c\uc624\ub294 \ube14\ub85c\uadf8 \uadf8\ub8f9\ub4e4\uc744 \ucc3e\uc744 \uc218 \uc788\ub2e4.\n", | |
| "\n", | |
| "\uc774 \uadf8\ub8f9\uc744 \ud1b5\ud574 \uc628\ub77c\uc778\uc758 \uac70\ub300 \ube14\ub85c\uadf8\uc5d0 \ub300\ud55c \uac80\uc0c9, \uc0c9\uc778, \ubc1c\uad74\uc5d0 \ub3c4\uc6c0\uc774 \ub41c\ub2e4.\n", | |
| "\n", | |
| "### \ud53c\ub4dc\ub0b4 \ub2e8\uc5b4 \uc138\uae30" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ube14\ub85c\uadf8\ub294 RSS\ud53c\ub4dc\ub97c feedparser\ub85c \ubd84\uc11d\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import feedparser\n", | |
| "import re\n", | |
| "\n", | |
| "# \ud55c RSS\uc548\uc5d0 \uc788\ub294 \uc81c\ubaa9\uacfc \ub2e8\uc5b4 \ucd9c\ud604 \ud69f\uc218 \ub515\uc154\ub108\ub9ac\ub97c \ub9ac\ud134\ud568\n", | |
| "def getwordcounts(url):\n", | |
| " # \ud53c\ub4dc\ub97c \ud30c\uc2f1\ud568\n", | |
| " d = feedparser.parse(url)\n", | |
| " wc = {}\n", | |
| " \n", | |
| " # \ubaa8\ub4e0 \uac8c\uc2dc\uae00\ubcc4\ub85c \ub8e8\ud504\ub97c \ub3d4\n", | |
| " for e in d.entries:\n", | |
| " if 'summary' in e:\n", | |
| " summary = e.summary\n", | |
| " else:\n", | |
| " summary = e.description\n", | |
| " \n", | |
| " # \ub2e8\uc5b4 \ubaa9\ub85d\uc744 \ucd94\ucd9c\ud568\n", | |
| " words = getwords(e.title + ' ' + summary)\n", | |
| " for word in words:\n", | |
| " wc.setdefault(word, 0)\n", | |
| " wc[word] += 1\n", | |
| " return d.feed.title, wc" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "RSS\uc640 Atom\ud53c\ub4dc\uc5d0\uc11c \uc694\uc57d\ubb38\uc744 \uad6c\ud558\uace0 \uc774\uc5d0 \ub300\ud574 getwords \ud568\uc218\ub97c \ud638\ucd9c\ud55c\ub2e4" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def getwords(html):\n", | |
| " # \ubaa8\ub4e0 HTML\ud0dc\uadf8\ub97c \uc81c\uac70\ud568\n", | |
| " txt = re.compile(r'<[^>]+>').sub('', html)\n", | |
| " # \ube44-\uc54c\ud30c\uba67 \ubb38\uc790\ub4e4\ub85c \ub2e8\uc5b4\ub97c \ubd84\ub9ac\ud568\n", | |
| " words = re.compile(r'[^A-Z^a-z]+').split(txt)\n", | |
| " # \uc18c\ubb38\uc790\ub85c \ubcc0\ud658\ud568\n", | |
| " return [word.lower() for word in words if word != '']" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uac00\uc7a5 \ub9ce\uc774 \ucc38\uc870\ub41c \ube14\ub85c\uadf8\ub4e4\uc5d0 \ub300\ud55c \uc8fc\uc18c feedlist.txt\ub97c \uc77d\uc5b4\uc11c \uc0ac\uc6a9" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "apcount = {}\n", | |
| "wordcounts = {}\n", | |
| "feedlist = []\n", | |
| "\n", | |
| "for feedurl in file('feedlist.txt'):\n", | |
| " try: # <-- \ucd94\uac00\ub41c \uc608\uc678 \ucc98\ub9ac\n", | |
| " feedlist.append(feedurl)\n", | |
| " title, wc = getwordcounts(feedurl)\n", | |
| " if title is not None:\n", | |
| " wordcounts[title] = wc\n", | |
| " for word, count in wc.items():\n", | |
| " apcount.setdefault(word, 0)\n", | |
| " if count > 1:\n", | |
| " apcount[word] += 1\n", | |
| " except Exception, e:\n", | |
| " print 'Failed to parse feed %s' % feedurl, str(e)\n", | |
| "print 'done'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Failed to parse feed http://blogs.abcnews.com/theblotter/index.rdf\r\n", | |
| "object has no attribute 'title'\n", | |
| "Failed to parse feed http://featured.gigaom.com/feed/\r\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "object has no attribute 'title'\n", | |
| "Failed to parse feed http://powerlineblog.com/index.rdf\r\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "!head feedlist.txt" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ube14\ub85c\uadf8\ubcc4\ub85c \uc9d1\uacc4\ud560 \ub2e8\uc5b4 \ubaa9\ub85d\uc744 \uc0dd\uc131\ud55c\ub2e4. \ucd5c\uc18c\uc640 \ucd5c\ub300 \ube44\uc911 \ub0b4\uc5d0 \uc788\ub294 \ub2e8\uc5b4\ub4e4\ub9cc \uc120\ud0dd\ud558\ub294 \ubc29\uc2dd (10% \ucd08\uacfc 50% \ubbf8\ub9cc)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "wordlist = []\n", | |
| "\n", | |
| "for w, bc in apcount.items():\n", | |
| " frac = float(bc) / len(feedlist)\n", | |
| " if frac > 0.1 and frac < 0.5:\n", | |
| " wordlist.append(w)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ub9c8\uc9c0\ub9c9\uc73c\ub85c \ub2e8\uc5b4 \ubaa9\ub85d\uacfc \ube14\ub85c\uadf8 \ubaa9\ub85d\uc744 \uc0ac\uc6a9\ud558\uc5ec \uac01 \ube14\ub85c\uadf8\uc5d0 \uc788\ub294 \ubaa8\ub4e0 \ub2e8\uc5b4\ub4e4\uc758 \ud69f\uc218\ub97c \ub2f4\uace0 \uc788\ub294 \uac70\ub300\ud55c \ud589\ub82c\uc744 \ub9cc\ub4e4\uc5b4 \uc800\uc7a5" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "out = file('blogdata.txt', 'wt') # <-- 't' \ucd94\uac00\n", | |
| "out.write('Blog')\n", | |
| "\n", | |
| "for word in wordlist:\n", | |
| " out.write('\\t%s' % word)\n", | |
| "out.write('\\n')\n", | |
| "\n", | |
| "for blog, wc in wordcounts.items():\n", | |
| " out.write(blog)\n", | |
| " for word in wordlist:\n", | |
| " if word in wc:\n", | |
| " out.write('\\t%d' % wc[word])\n", | |
| " else:\n", | |
| " out.write('\\t0')\n", | |
| " out.write('\\n')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 297 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 3. \uacc4\uce35\uc801 \uad70\uc9d1\ud654\n", | |
| "\n", | |
| "\uac00\uc7a5 \uc720\uc0ac\ud55c \ub450 \uadf8\ub8f9\uc744 \uacc4\uc18d \ubcd1\ud569\ud558\ub294 \ubc29\uc2dd\uc73c\ub85c \uadf8\ub8f9 \uacc4\uce35\uc744 \ub9cc\ub4e0\ub2e4.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc5f0\uacb0\uc120\uc740 \uc5b4\ub5a4 \ud56d\ubaa9\ub4e4\uc774 \uac01 \uad70\uc9d1\uc758 \ub05d\ub2e8\uc5d0 \uc788\ub294\uc9c0\ub97c \ub098\ud0c0\ub0bc \ubfd0\ub9cc \uc544\ub2c8\ub77c \uac01 \ud56d\ubaa9\uac04\uc758 \ub5a8\uc5b4\uc9c4 \uac70\ub9ac\ub3c4 \ub73b\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc774\uc81c \ube14\ub85c\uadf8 \ub370\uc774\ud130 \uc138\ud2b8\ub97c \uad70\uc9d1\ud654\ud574\uc11c \uacc4\uce35\ub3c4\ub97c \ub9cc\ub4e0\ub2e4. \uc798\ub418\uba74 **\uc8fc\uc81c\ubcc4 \uadf8\ub8f9**\uc774 \ub9cc\ub4e4\uc5b4 \uc9c8 \uac83\uc774\ub2e4. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def readfile(filename):\n", | |
| " lines = [line for line in file(filename)]\n", | |
| " \n", | |
| " # \uccab\ubc88\uc9f8 \uac00\ub85c\uc904\uc740 \uc138\ub85c\uc904 \uc81c\ubaa9\uc784\n", | |
| " colnames = lines[0].strip().split('\\t')[1:]\n", | |
| " rownames = []\n", | |
| " data = []\n", | |
| " for line in lines[1:]:\n", | |
| " p = line.strip().split('\\t')\n", | |
| " # \uac01 \uac00\ub85c\uc904\uc758 \uccab \ubc88\uc9f8 \uc138\ub85c\uc904\uc740 \uac00\ub85c\uc904 \uc774\ub984\uc784\n", | |
| " rownames.append(p[0])\n", | |
| " # \uac00\ub85c\uc904\uc758 \ub098\uba38\uc9c0\uac00 \ub370\uc774\ud130\n", | |
| " data.append([float(x) for x in p[1:]])\n", | |
| " return rownames, colnames, data" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc774 \ud568\uc218\ub294 \ud30c\uc77c\uc758 \uccab\ubc88\uc9f8 \uac00\ub85c\uc904\uc5d0\uc11c \uc138\ub85c\uc904 \uc81c\ubaa9(\ub2e8\uc5b4)\uc744 \uc77d\uace0, \uac01 \uac00\ub85c\uc904\uc758 \uccab \ubc88\uc9f8 \uc138\ub85c\uc904\uc5d0\uc11c \uac00\ub85c\uc904 \uc774\ub984(\ube14\ub85c\uadf8 \uba85)\uc744 \uc77d\uace0, \uadf8 \uc904\uc758 \ub098\uba38\uc9c0\ub97c \ub370\uc774\ud130\ub85c \uc77d\ub294\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ub2e4\uc74c\uc73c\ub85c \uadfc\uc811\ub3c4\ub97c \uc815\uc758\ud55c\ub2e4. \ub2e8\uc5b4 \uc218\uc758 \ucc28\uc774\uac00 \uc788\uae30\uc5d0 \ud53c\uc5b4\uc2a8 \uacc4\uc218\uac00 \ub354 \uc801\ub2f9\ud558\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from math import sqrt\n", | |
| "\n", | |
| "def pearson(v1, v2):\n", | |
| " # \ub2e8\uc21c\ud55c \ud569 \uacc4\uc0b0\n", | |
| " sum1 = sum(v1)\n", | |
| " sum2 = sum(v2)\n", | |
| " \n", | |
| " # \uc81c\uacf1\uc758 \ud569 \uacc4\uc0b0\n", | |
| " sum1Sq = sum([pow(v, 2) for v in v1])\n", | |
| " sum2Sq = sum([pow(v, 2) for v in v2])\n", | |
| " \n", | |
| " # \uacf1\uc758 \ud569 \uacc4\uc0b0\n", | |
| " pSum = sum([v1[i]*v2[i] for i in range(len(v1))])\n", | |
| " \n", | |
| " # \ud53c\uc5b4\uc2a8 \uacc4\uc218 r\uc758 \uacc4\uc0b0\n", | |
| " num = pSum - (sum1 * sum2 / len(v1))\n", | |
| " den = sqrt((sum1Sq-pow(sum1, 2)/len(v1)) * (sum2Sq-pow(sum2, 2)/len(v1)))\n", | |
| " if den == 0: return 0\n", | |
| " \n", | |
| " return 1 - num/den" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
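| "\ud53c\uc5b4\uc2a8 \uacc4\uc218\uac00 1\uc774\uba74 \uc644\uc804\ud788 \uc77c\uce58\ud558\ub294 \uacbd\uc6b0, 0\uc774\uba74 \uc804\ud600 \uad00\uacc4\uac00 \uc5c6\ub2e4. \ub2e8, \uc704 pearson \ud568\uc218\ub294 1 - r\uc744 \ub9ac\ud134\ud558\ubbc0\ub85c \uac12\uc774 \uc791\uc744\uc218\ub85d \ub354 \uc720\uc0ac\ud558\ub2e4." | |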
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uacc4\uce35 \ud2b8\ub9ac\ub97c \uad6c\uc131\ud558\uae30 \uc704\ud574 \ub178\ub4dc \ud074\ub798\uc2a4\ub97c \ub9cc\ub4e0\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "class bicluster:\n", | |
| " def __init__(self, vec, left=None, right=None, distance=0.0, id=None):\n", | |
| " self.left = left\n", | |
| " self.right = right\n", | |
| " self.vec = vec\n", | |
| " self.id = id\n", | |
| " self.distance = distance" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uac01 \uad70\uc9d1\uc740 \uc704\uce58 \ub370\uc774\ud130\ub97c \uac00\uc9c4\ub2e4. \uc704\uce58\ub294 \uc885\uc810 \ud639\uc740 \ub2e4\ub978 \uc720\ud615\uc758 \ube0c\ub79c\uce58\uc5d0\uc11c \ubcd1\ud569\ub41c \ub370\uc774\ud130\uc774\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc6d0\ub798\uc758 \ud56d\ubaa9\ub4e4\ub9cc\uc744 \uc21c\ud68c\ud574 \uac00\uc7a5 \uc720\uc0ac\ud55c(\uac00\uae4c\uc6b4) \ub450\uac1c\ub97c \ucc3e\uc544 \ud558\ub098\uc758 \uad70\uc9d1\uc744 \ub9cc\ub4e0\ub2e4. \ub9cc\ub4e4\uc5b4\uc9c4 \uad70\uc9d1\uc740 \ub450 \ud56d\ubaa9\uc758 \ud3c9\uade0\uac12\uc744 \uac00\uc9c4\ub2e4. \ud558\ub098\uc758 \uad70\uc9d1\ub9cc \ub0a8\uc744 \ub54c\uae4c\uc9c0 \uc55e\uc758 \uacfc\uc815\uc744 \ubc18\ubcf5\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def hcluster(rows, distance = pearson):\n", | |
| " distances = {}\n", | |
| " currentclustid = -1\n", | |
| " \n", | |
| " # \ucd08\uae30 \uad70\uc9d1\ub4e4\uc744 \uac01 \uac00\ub85c\uc904\uc5d0\uc11c \uc0dd\uc131\ud568\n", | |
| " clust = [bicluster(rows[i], id = i) for i in range(len(rows))]\n", | |
| " \n", | |
| " while len(clust) > 1:\n", | |
| " lowestpair = (0, 1)\n", | |
| " closest = distance(clust[0].vec, clust[1].vec)\n", | |
| " \n", | |
| " # \uac00\uc7a5 \uc791\uc740 \uac70\ub9ac \uac12\uc744 \uac00\uc9c0\ub294 \uc30d\uc744 \ucc3e\ub294 \ub8e8\ud504\n", | |
| " for i in range(len(clust)):\n", | |
| " for j in range(i+1, len(clust)):\n", | |
| " # distances\ub294 \uac70\ub9ac \uacc4\uc0b0 \uce90\uc2dc\n", | |
| " if (clust[i].id, clust[j].id) not in distances:\n", | |
| " distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)\n", | |
| " d = distances[(clust[i].id, clust[j].id)]\n", | |
| " \n", | |
| " if d < closest:\n", | |
| " closest = d\n", | |
| " lowestpair = (i, j)\n", | |
| " \n", | |
| " # \ub450 \uad70\uc9d1\uac04 \ud3c9\uade0\uc744 \uacc4\uc0b0\ud568\n", | |
| " mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i])/2.0\n", | |
| " for i in range(len(clust[0].vec))]\n", | |
| " \n", | |
| " # \uc0c8\ub85c\uc6b4 \uad70\uc9d1\uc744 \uc0dd\uc131\ud568\n", | |
| " newcluster = bicluster(mergevec, left = clust[lowestpair[0]], \n", | |
| " right = clust[lowestpair[1]],\n", | |
| " distance = closest, id = currentclustid)\n", | |
| " \n", | |
| " # \uc6d0\ub798\uc758 \uc9d1\ud569 \uc548\uc5d0 \ud3ec\ud568\ub418\uc9c0 \uc54a\uc740 \uad70\uc9d1 id\ub4e4\uc740 \uc74c\uc218\uc784\n", | |
| " currentclustid -= 1\n", | |
| " del clust[lowestpair[1]]\n", | |
| " del clust[lowestpair[0]]\n", | |
| " clust.append(newcluster)\n", | |
| " \n", | |
| " return clust[0]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": "*" | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uac01 \uad70\uc9d1\uc740 \uc790\uc2e0\uc744 \ub9cc\ub4e4\uae30 \uc704\ud574 \ubcd1\ud569\ub41c \ub450 \uac1c \uad70\uc9d1\ub4e4\uc744 \ucc38\uace0\ud558\uae30 \ub54c\ubb38\uc5d0 \uc774 \ud568\uc218\uc5d0\uc11c \ub9ac\ud134\ub41c \ub9c8\uc9c0\ub9c9 \uad70\uc9d1\uc73c\ub85c \ubaa8\ub4e0 \uad70\uc9d1\ub4e4\uacfc \ub05d\ub2e8 \ub178\ub4dc\ub4e4\uc744 \uc7ac\uadc0\uc801\uc73c\ub85c \uc7ac\uc0dd\uc131\ud560 \uc218 \uc788\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "blognames, words, data = readfile('blogdata.txt')\n", | |
| "clust = hcluster(data)\n", | |
| "print clust" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'readfile' is not defined", | |
| "output_type": "pyerr", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-1-c90f3eab1e05>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mblognames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreadfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'blogdata.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhcluster\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mclust\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mNameError\u001b[0m: name 'readfile' is not defined" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uacb0\uacfc\uac00 \ub098\uc624\uba74 \ub2e4\uc74c\uacfc \uac19\uc740 \uac04\ub2e8\ud55c \ud568\uc218\ub97c \ud1b5\ud574 \uacc4\uce35\uc744 \ucd9c\ub825\ud560 \uc218 \uc788\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def printclust(clust, labels=None, n=3):\n", | |
| " # \uacc4\uce35\uad6c\uc870\ub97c \ub9cc\ub4e4\uae30 \uc704\ud574 \ub4e4\uc5ec \uc500\n", | |
| " for i in range(n):\n", | |
| " print ' ',\n", | |
| " if clust.id < 0:\n", | |
| " # \uc74c\uc218 id \uac12\uc740 \ud2b8\ub9ac\uc758 \ube0c\ub79c\uce58\ub97c \ub73b\ud568\n", | |
| " print '-'\n", | |
| " else:\n", | |
| " # \uc591\uc218 id \uac12\uc740 \ud2b8\ub9ac\uc758 \uc885\uc810\uc744 \ub73b\ud568\n", | |
| " if labels == None:\n", | |
| " print clust.id\n", | |
| " else:\n", | |
| " print labels[clust.id]\n", | |
| " \n", | |
| " # \uc6b0\uce21\uacfc \uc88c\uce21 \ube0c\ub79c\uce58\ub97c \ucd9c\ub825\n", | |
| " if clust.left != None:\n", | |
| " printclust(clust.left, labels=labels, n=n+1)\n", | |
| " if clust.right != None:\n", | |
| " printclust(clust.right, labels=labels, n=n+1)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 261 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc774 \uad70\uc9d1\ub4e4\uc5d0\uc11c \uacf5\ud1b5\uc810\uc744 \ubc1c\uacac\ud560 \uc218 \uc788\ub294\uac00?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 4. \uacc4\ud1b5\ub3c4 \ucd9c\ub825\n", | |
| "\n", | |
| "\uad70\uc9d1\ub4e4\uc744 \uacc4\ud1b5\ub3c4 \ud615\ud0dc\ub85c \uadf8\ub824\ubcf4\uba74 \ud574\uc11d\uc774 \ud6e8\uc52c \ud3b8\ub9ac\ud558\ub2e4. \uacc4\ud1b5\ub3c4\ub294 \uc791\uc740 \uacf5\uac04\uc5d0 \ube44\uad50\uc801 \ub9ce\uc740 \uc815\ubcf4\ub97c \ub2f4\uc744 \uc218 \uc788\ub2e4.\n", | |
| "\n", | |
| "\uba3c\uc800, \uc8fc\uc5b4\uc9c4 \uad70\uc9d1\uc758 \uc804\uccb4 \ub192\uc774\ub97c \ub9ac\ud134\ud558\ub294 \ud568\uc218\ub97c \ub9cc\ub4e0\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from PIL import Image, ImageDraw\n", | |
| "\n", | |
| "def getheight(clust):\n", | |
| " # \uc885\uc810\uc778 \uacbd\uc6b0 \ub192\uc774\ub294 1\uc784\n", | |
| " if clust.left == None and clust.right == None:\n", | |
| " return 1\n", | |
| " \n", | |
| " # \uadf8\ub807\uc9c0 \uc54a\uc73c\uba74 \ub192\uc774\ub294 \uac01 \ube0c\ub79c\uce58 \ub192\uc774\ub4e4\uc758 \ud569 \n", | |
| " return getheight(clust.left) + getheight(clust.right)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 263 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ub2e4\uc74c\uc73c\ub85c \ucd5c\uc0c1\uc704 \ub8e8\ud2b8 \ub178\ub4dc\uc758 \uae4a\uc774\ub97c \uc54c\uc544\uc57c \ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def getdepth(clust):\n", | |
| " # \uc885\uc810 \uac70\ub9ac\ub294 0.0\uc784\n", | |
| " if clust.left == None and clust.right == None:\n", | |
| " return 0\n", | |
| " \n", | |
| " # \ube0c\ub79c\uce58\uc758 \uac70\ub9ac\ub294 \uc591\ucabd \uc911 \ud070 \uac83\uc5d0 \uc790\uc2e0\uc758 \uac70\ub9ac\ub97c \ub354\ud55c \uac12\uc784\n", | |
| " return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 264 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "drawdendrogram \ud568\uc218\ub294 \ucd5c\uc885 \uad70\uc9d1\ub9c8\ub2e4 \ub192\uc774 20\ud53d\uc140\uacfc \uace0\uc815\ub41c \ud3ed\uc744 \uac00\uc9c4 \uc774\ubbf8\uc9c0\ub97c \uc0dd\uc131\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# jpeg -> png\ub85c \uc218\uc815\n", | |
| "def drawdendrogram(clust, labels, png='clusters.png'):\n", | |
| " # \ub192\uc774\uc640 \ud3ed\n", | |
| " h = getheight(clust) * 20\n", | |
| " w = 1200\n", | |
| " depth = getdepth(clust)\n", | |
| " \n", | |
| " # \uace0\uc815 \ud3ed\uc5d0 \ub9de\uac8c \ube44\uc728 \uc870\uc815\n", | |
| " scaling = float(w-150) / depth\n", | |
| " \n", | |
| " # \ud770\uc0c9 \ubc30\uacbd\uc758 \uc0c8\ub85c\uc6b4 \uc774\ubbf8\uc9c0 \uc0dd\uc131\n", | |
| " img = Image.new('RGB', (w, h), (255, 255, 255))\n", | |
| " draw = ImageDraw.Draw(img)\n", | |
| " \n", | |
| " draw.line((0, h/2, 10, h/2), fill=(255, 0, 0))\n", | |
| " \n", | |
| " # \uccab \ubc88\uc9f8 \ub178\ub4dc \uadf8\ub9bc\n", | |
| " drawnode(draw, clust, 10, (h/2), scaling, labels)\n", | |
| " img.save(png, 'PNG')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 265 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "drawnode \ud568\uc218\ub294 \uc790\uc2dd \ub178\ub4dc\ub4e4\uc758 \ub192\uc774\ub97c \uacc4\uc0b0\ud558\uc5ec \uc790\uc2e0\uc774 \uc5b4\ub514\uc5d0 \uc788\uc5b4\uc57c \ud558\ub294\uc9c0 \uc704\uce58\ub97c \uacc4\uc0b0\ud55c\ub2e4. \ud55c \uac1c\uc758 \uae34 \uc218\uc9c1\uc120\uacfc \ub450\uac1c\uc758 \uc218\ud3c9\uc120\uc744 \uadf8\ub9b0\ub2e4. \uc218\ud3c9\uc120 \uae38\uc774\ub294 \uad70\uc9d1 \ub0b4 \uc624\ub958 \uc815\ub3c4\uc5d0 \uc758\ud574 \uacb0\uc815\ub41c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def drawnode(draw, clust, x, y, scaling, labels):\n", | |
| " if clust.id < 0:\n", | |
| " h1 = getheight(clust.left) * 20\n", | |
| " h2 = getheight(clust.right) * 20\n", | |
| " top = y - (h1 + h2)/2\n", | |
| " bottom = y + (h1 + h2)/2\n", | |
| " \n", | |
| " # \uc120\uae38\uc774\n", | |
| " ll = clust.distance * scaling\n", | |
| " \n", | |
| " # \uc774 \uad70\uc9d1\uc5d0\uc11c \uc790\uc2dd\ub4e4\uae4c\uc9c0\uc758 \uc218\uc9c1\uc120\n", | |
| " draw.line((x, top + h1/2, x, bottom - h2/2), fill=(255, 0, 0))\n", | |
| " \n", | |
| " # \uc67c\ucabd \ud56d\ubaa9\uae4c\uc9c0 \uc218\ud3c9\uc120\n", | |
| " draw.line((x, top + h1/2, x + ll, top + h1/2), fill=(255, 0, 0))\n", | |
| " \n", | |
| " # \uc624\ub978\ucabd \ud56d\ubaa9\uae4c\uc9c0 \uc218\ud3c9\uc120\n", | |
| " draw.line((x, bottom - h2/2, x + ll, bottom - h2/2), fill=(255, 0, 0))\n", | |
| " \n", | |
| " # \uc774 \ud568\uc218\ub85c \uc67c\ucabd \uc624\ub978\ucabd \ub178\ub4dc\ub97c \uadf8\ub9bc\n", | |
| " drawnode(draw, clust.left, x + ll, top + h1/2, scaling, labels)\n", | |
| " drawnode(draw, clust.right, x + ll, bottom - h2/2, scaling, labels)\n", | |
| " else:\n", | |
| " # \uc885\uc810\uc774\uba74 \ud56d\ubaa9 \ub77c\ubca8\uc744 \uadf8\ub9bc\n", | |
| " draw.text((x + 5, y - 7), labels[clust.id], (0, 0, 0))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 266 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "drawdendrogram(clust, blognames, png='blogclust.png')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 267 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 5. \uc138\ub85c\uc904 \uad70\uc9d1\ud654\n", | |
| "\n", | |
| "\ube14\ub85c\uadf8 \ub370\uc774\ud130 \uc138\ud2b8\uc5d0\uc11c \uc138\ub85c\uc904\uc740 \ub2e8\uc5b4\uc778\ub370, \ud568\uaed8 \uc0ac\uc6a9\ub418\ub294 \ub2e8\uc5b4\ub4e4\uc744 \ubaa8\uc544\ubcf4\ub294 \uac83\ub3c4 \uc7ac\ubbf8\uc788\uc744 \uac83 \uac19\ub2e4. \uc544\ub798 \ud568\uc218\ub85c \uc138\ub85c\uc904(\ub2e8\uc5b4)\uc774 \uac00\ub85c\uc904\uc774 \ub418\ub3c4\ub85d \ub370\uc774\ud130 \uc138\ud2b8\ub97c \ud68c\uc804\uc2dc\ud0a8\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def rotatematrix(data):\n", | |
| " newdata = []\n", | |
| " for i in range(len(data[0])):\n", | |
| " newrow = [data[j][i] for j in range(len(data))]\n", | |
| " newdata.append(newrow)\n", | |
| " return newdata" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 268 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ub370\uc774\ud130\ub97c \ud68c\uc804 \ud6c4 \uc544\ub798\uc640 \uac19\uc774 \uad70\uc9d1\ub3c4\ub97c \ub9cc\ub4e0\ub2e4. \ube14\ub85c\uadf8\uc5d0 \ube44\ud574 \ub2e8\uc5b4 \uc218\uac00 \ud6e8\uc52c \ub9ce\uc544\uc11c \uc2dc\uac04\uc774 \ub354 \uac78\ub9b0\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "rdata = rotatematrix(data)\n", | |
| "wordclust = hcluster(rdata)\n", | |
| "drawdendrogram(wordclust, labels=words, png='wordclust.png')\n", | |
| "print 'done'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "done\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 269 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 6. K\ud3c9\uade0 \uad70\uc9d1\ud654\n", | |
| "\n", | |
| "\uacc4\uce35\uc801 \uad70\uc9d1\ud654 \uae30\ubc95\uc758 \ud2b8\ub9ac \uad6c\uc870\ub85c\ub294 **\ub370\uc774\ud130\ub97c \ub69c\ub838\ud55c \uadf8\ub8f9\uc73c\ub85c \ucabc\uac1c\uc9c0 \ubabb\ud55c\ub2e4**. \ub610 \ub9ce\uc740 \uacc4\uc0b0\uc744 \ud544\uc694\ub85c \ud55c\ub2e4. \uc774\uc5d0 \ub300\ud55c \ub300\uc548\uc73c\ub85c **K\ud3c9\uade0 \uad70\uc9d1\ud654(K-means clustering)** \uae30\ubc95\uc774 \uc788\ub2e4. \uad70\uc9d1\uc758 \uac1c\uc218(k)\ub97c \ubbf8\ub9ac \uc815\ud574 \uc8fc\uba74, \ub370\uc774\ud130 \uad6c\uc870\uc5d0 \uadfc\uac70\ud574\uc11c \uac01 \uad70\uc9d1\uc758 \ud06c\uae30\uac00 \uacb0\uc815\ub41c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "1. \ubb34\uc791\uc704\ub85c \uc120\uc815\ub41c k\uac1c\uc758 \uc911\uc2ec\uc810(centroid: \uad70\uc9d1 \uc911\uc2ec)\uc744 \uc120\uc815\n", | |
| "2. \uc774 \uc810\uc5d0\uc11c \uac00\uc7a5 \uadfc\uc811\ud55c \ud56d\ubaa9\ub4e4\uc744 \ud560\ub2f9\n", | |
| "3. \ud560\ub2f9\ub41c \ubaa8\ub4e0 \ub178\ub4dc\ub4e4\uc758 \ud3c9\uade0\uc704\uce58\ub85c \uc911\uc2ec\uc810\uc744 \uc774\ub3d9\n", | |
| "4. \ud560\ub2f9\uc774 \ub354 \uc5c6\uc744 \ub54c\uae4c\uc9c0 \uc7ac\ud560\ub2f9\uc744 \uc218\ud589" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import random\n", | |
| "\n", | |
| "def kcluster(rows, distance=pearson, k=4):\n", | |
| " # \uac01 \uc810\uc758 \ucd5c\ub300, \ucd5c\uc18c\uac12\uc744 \uad6c\ud568\n", | |
| " ranges = [\n", | |
| " (min([row[ri] for row in rows]), max([row[ri] for row in rows]))\n", | |
| " for ri in range(len(rows[0]))\n", | |
| " ]\n", | |
| " \n", | |
| " # \uc784\uc758\ub85c \uc120\uc815\ud55c k\uac1c\uc758 \uc911\uc2ec\uc810\uc744 \uc0dd\uc131\n", | |
| " clusters = [\n", | |
| " [\n", | |
| " random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]\n", | |
| " for i in range(len(rows[0]))\n", | |
| " ] \n", | |
| " for ci in range(k)\n", | |
| " ]\n", | |
| " \n", | |
| " lastmatches = None\n", | |
| " for t in range(100):\n", | |
| " print 'Iteration %d' % t\n", | |
| " bestmatches = [[] for i in range(k)]\n", | |
| " \n", | |
| " # \uac01 \uac00\ub85c\uc904\ubcc4\ub85c \uac00\uc7a5 \uadfc\uc811\ud55c \uc911\uc2ec\uc810\uc744 \ucc3e\uc74c\n", | |
| " for ri in range(len(rows)):\n", | |
| " row = rows[ri]\n", | |
| " bestmatch = 0\n", | |
| " for ci in range(k):\n", | |
| " d = distance(clusters[ci], row)\n", | |
| " if d < distance(clusters[bestmatch], row):\n", | |
| " bestmatch = ci;\n", | |
| " bestmatches[bestmatch].append(ri)\n", | |
| " \n", | |
| " # \uc774\uc804\uacfc \uac19\uc740 \uacb0\uacfc\ub77c\uba74 \uc644\ub8cc\ud568\n", | |
| " if bestmatches == lastmatches:\n", | |
| " break\n", | |
| " lastmatches = bestmatches\n", | |
| " \n", | |
| " # \uc911\uc2ec\uc810\uc744 \uba64\ubc84\ub4e4\uc758 \ud3c9\uade0\uc73c\ub85c \uc774\ub3d9\ud568\n", | |
| " for ci in range(k):\n", | |
| " avgs = [0.0] * len(rows[0])\n", | |
| " if len(bestmatches[ci]) > 0:\n", | |
| " for rowid in bestmatches[ci]:\n", | |
| " for m in range(len(rows[rowid])):\n", | |
| " avgs[m] += rows[rowid][m]\n", | |
| " for j in range(len(avgs)):\n", | |
| " avgs[j] /= len(bestmatches[ci])\n", | |
| " clusters[ci] = avgs\n", | |
| " \n", | |
| " return bestmatches" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 270 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ucd5c\uc885 \uacb0\uacfc\ub97c \uc0dd\uc0b0\ud558\ub294 \ubc18\ubcf5 \ud69f\uc218\ub294 \uacc4\uce35\uc801 \uad70\uc9d1\ud654\uc5d0 \ube44\ud574 \uc791\uae30\uc5d0 \ub9e4\uc6b0 \ube68\ub9ac \ub3d9\uc791\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "K = 10\n", | |
| "kclust = kcluster(data, k=K)\n", | |
| "for i in range(K):\n", | |
| " print '----'\n", | |
| " print [blognames[r] for r in kclust[i]]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Iteration 0\n", | |
| "Iteration 1" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Iteration 2" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Iteration 3" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Iteration 4" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "----" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "['A Consuming Experience (full feed)', 'mezzoblue', 'Eschaton', 'we make money not art', 'PaulStamatiou.com - Tech News, Reviews and Guides', 'ongoing by Tim Bray', '456 Berea Street', 'Signal vs. Noise', 'Joel on Software', 'Online Marketing Report', 'Creating Passionate Users', 'Derek Powazek', \"Jeremy Zawodny's blog\", 'plasticbag.org', 'blog maverick', 'gapingvoid: \"cartoons drawn on the back of business cards\"', 'The Viral Garden', '43 Folders', 'MetaFilter', 'Lifehack', 'ShoeMoney', 'Oilman']\n", | |
| "----\n", | |
| "['Search Engine Roundtable', 'Matt Cutts: Gadgets, Google, and SEO', 'The Official Google Blog', 'Scobleizer', 'Search Engine Watch - Latest', 'Google Blogoscoped', \"O'Reilly Radar\", 'Google Operating System', 'Quick Online Tips']\n", | |
| "----\n", | |
| "['Schneier on Security', \"John Battelle's Search Blog\", 'Publishing 2.0', 'BuzzMachine', 'ReadWrite']\n", | |
| "----\n", | |
| "[\"SpikedHumor - Today's Videos and Pictures\"]\n", | |
| "----\n", | |
| "['TUAW - The Unofficial Apple Weblog']\n", | |
| "----\n", | |
| "['Copyblogger', 'Instapundit', 'Joho the Blog', 'NewsBusters - Exposing Liberal Media Bias', 'Bloggers Blog', \"Captain's Quarters\", 'Crooks and Liars', 'ThinkProgress', 'The Dish', 'Mashable', 'Michelle Malkin']\n", | |
| "----\n", | |
| "['SimpleBits', 'How to Change the World', \"Joi Ito's Web\", 'flagrantdisregard', '@ProBlogger']\n", | |
| "----\n", | |
| "['Engadget RSS Feed', 'TechCrunch']\n", | |
| "----\n", | |
| "['Gothamist', 'Techdirt.', 'kottke.org', 'Boing Boing', 'Joystiq', 'Download Squad', 'Topix Blog', 'Autoblog', 'Go Fug Yourself', 'Wired Top Stories', 'Gizmodo', \"Neil Gaiman's Journal\", \"Seth Godin's Blog on marketing, tribes and respect\"]\n", | |
| "----\n", | |
| "[\"The Superficial - Because You're Ugly\", 'TMZ.com', 'PerezHilton', 'Slashdot', 'The Full Feed from HuffingtonPost.com', 'Pharyngula', \"Steve Pavlina's Personal Development Blog\", 'WIL WHEATON dot NET', 'Daily Kos', 'Deadspin', 'Kotaku', 'Gawker']\n", | |
| "done\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 271 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 7. \uc120\ud638\ub3c4 \uad70\uc9d1\n", | |
| "\n", | |
| "\uc81c\ubcf4(http://www.zebo.com)\uc5d0\uc11c \uacc4\uc815\uc744 \ub9cc\ub4e4\uace0 \uac00\uc9c4 \ud639\uc740 \uac00\uc9c0\uace0 \uc2f6\uc740 \ubb3c\uac74\uc758 \ubaa9\ub85d\uc744 \ub9cc\ub4e4\uba74 \uc120\ud638\ub3c4\uc5d0 \ub530\ub978 \uad70\uc9d1\uc744 \ub9cc\ub4e4 \uc218 \uc788\ub2e4.\n", | |
| "(\ud604\uc7ac \uc0ac\uc774\ud2b8\ub294 \uc11c\ube44\uc2a4 \uc911\ub2e8 \uc0c1\ud0dc\uc778 \ub4ef.. \u3160\u3160)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Beautiful Soup\n", | |
| "\n", | |
| "\uc6f9\ud398\uc774\uc9c0\ub098 XML\uc744 \ud30c\uc2f1\ud558\uace0 \ud3b8\ub9ac\ud558\uac8c \ucc3e\uc544\ubcfc \uc218 \uc788\ub294 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc774\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import urllib2\n", | |
| "from BeautifulSoup import BeautifulSoup\n", | |
| "# c = urllib2.urlopen('http://kiwitobes.com/wiki/Programming_language.html') <-- \ud398\uc774\uc9c0 \uc0ac\ub77c\uc9d0\n", | |
| "c = urllib2.urlopen('http://kiwitobes.com')\n", | |
| "soup = BeautifulSoup(c.read())\n", | |
| "links = soup('a')\n", | |
| "print links[10]\n", | |
| "print ''\n", | |
| "print links[10]['href']" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "<a href=\"http://kiwitobes.files.wordpress.com/2013/09/img_20130919_224325.jpg\"><img class=\"alignnone size-medium wp-image-32\" alt=\"Twitter lights\" src=\"http://kiwitobes.files.wordpress.com/2013/09/img_20130919_224325.jpg?w=300&h=225\" width=\"300\" height=\"225\" /></a>\n", | |
| "\n", | |
| "http://kiwitobes.files.wordpress.com/2013/09/img_20130919_224325.jpg\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 249 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### \uc81c\ubcf4 \uacb0\uacfc \uc2a4\ud06c\ub798\ud551\n", | |
| "\n", | |
| "\ud56d\ubaa9 \ubaa9\ub85d\uc740 bgverdanasmall \ud074\ub798\uc2a4\ub97c \uac00\uc9c4\ub2e4. \uc5ec\uae30\uc5d0\uc11c \uc911\uc694 \ub370\uc774\ud130\ub97c \ucd94\ucd9c\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from BeautifulSoup import BeautifulSoup\n", | |
| "import urllib2\n", | |
| "import re\n", | |
| "\n", | |
| "chare = re.compile(r'[!-\\.&]')\n", | |
| "itemowners = {}\n", | |
| "\n", | |
| "# \uc0ad\uc81c\ud560 \ub2e8\uc5b4\ub4e4\n", | |
| "dropwords = ['a', 'new', 'some', 'more', 'my', 'own', 'the', 'many', 'other', 'another']\n", | |
| "\n", | |
| "currentuser = 0\n", | |
| "for i in range(1, 51):\n", | |
| " # \uc6d0\ud558\ub294 \uac80\uc0c9 \ud398\uc774\uc9c0 URL\n", | |
| " c = urllib2.urlopen(\n", | |
| " 'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d' % (i))\n", | |
| " soup = BeautifulSoup(c.read())\n", | |
| "    for td in soup('td'):\n", | |
| " # bgverdanasmall \ud074\ub798\uc2a4\ub97c \uac00\uc9c4 \ud14c\uc774\ube14 \uc140 \ucc3e\uae30\n", | |
| " if ('class' in dict(td.attrs) and td['class'] == 'bgverdanasmall'):\n", | |
| " items = [re.sub(chare, '', a.contents[0].lower()).strip() for a in td('a')]\n", | |
| " for item in items:\n", | |
| " # \uc5ec\ubd84\uc758 \ub2e8\uc5b4\ub4e4 \uc81c\uac70\n", | |
| " txt = ' '.join([t for t in item.split(' ') if t not in dropwords])\n", | |
| " if len(txt) < 2:\n", | |
| " continue\n", | |
| " itemowners.setdefault(txt, {})\n", | |
| " itemowners[txt][currentuser] = 1\n", | |
| " currentuser += 1" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "URLError", | |
| "evalue": "<urlopen error [Errno 60] Operation timed out>", | |
| "output_type": "pyerr", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mURLError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-165-53e2b91463bf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;31m# \uc6d0\ud558\ub294 \uac80\uc0c9 \ud398\uc774\uc9c0 URL\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m c = urllib2.urlopen(\n\u001b[0;32m---> 15\u001b[0;31m 'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d' % (i))\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtd\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'td'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_opener\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0m_opener\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_opener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 405\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0;31m# post-process response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 421\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[0;32m--> 422\u001b[0;31m '_open', req)\n\u001b[0m\u001b[1;32m 423\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 424\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 383\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mhttp_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1213\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1214\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhttplib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m//anaconda/python.app/Contents/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req)\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# XXX what error?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1184\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1185\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mURLError\u001b[0m: <urlopen error [Errno 60] Operation timed out>" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 165 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "**\uadf8\ub7ec\ub098 \uc5f0\uacb0\ub418\uc9c0 \uc54a\uc74c! \uc544\ub798 \ub9c1\ud06c\uc5d0\uc11c \ub2e4\uc6b4 \ubc1b\uc74c \u3160\u3160..**\n", | |
| "\n", | |
| "https://raw.github.com/arthur-e/Programming-Collective-Intelligence/master/chapter3/zebo.txt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc704 \ucf54\ub4dc\ub294 \uc81c\ubcf4\uc758 \"want\" \uac80\uc0c9 \uacb0\uacfc \uc911 \uc0c1\uc704 50\ud398\uc774\uc9c0\ub97c \ub2e4\uc6b4\ub85c\ub4dc\ud558\uace0 \ud30c\uc2f1\ud55c\ub2e4. \ud615\uc2dd\uc5c6\ub294 \ud14d\uc2a4\ud2b8\uc5ec\uc11c \"a\"\ub098 \"some\"\uac19\uc740 \ubd88\uc6a9\uc5b4\uc640 \uad6c\ub450\uc810\uc744 \uc0ad\uc81c\ud558\uace0 \ubaa8\ub4e0 \uac83\uc744 \uc18c\ubb38\uc790\ud654\ud558\ub294 \uc791\uc5c5\uc744 \ud55c\ub2e4.\n", | |
| "\n", | |
| "\uc774 \uc791\uc5c5\uc774 \ub05d\ub098\uba74 10\uba85\uc774 \ub118\ub294 \uc0ac\ub78c\uc774 \uc6d0\ud558\ub294 \ud56d\ubaa9\ub4e4\uc758 \ubaa9\ub85d\uc744 \ub9cc\ub4e0\ub2e4. \uc775\uba85 \uc0ac\uc6a9\uc790\ub97c \uc138\ub85c\uc904\ub85c \ud558\uace0 \ud56d\ubaa9\uc744 \uac00\ub85c\uc904\ub85c \ud558\ub294 \ud589\ub82c\uc744 \ub9cc\ub4e4\uace0 \uc800\uc7a5\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# \uc8fc\uc758: \uc2e4\ud589\ud558\uc9c0 \ub9d0\uac83!\n", | |
| "out = file('zebo.txt', 'wt')\n", | |
| "out.write('Item')\n", | |
| "\n", | |
| "for user in range(0, currentuser):\n", | |
| " out.write('\\tU%d' % user)\n", | |
| "out.write('\\n')\n", | |
| "\n", | |
| "for item, owners in itemowners.items():\n", | |
| " if len(owners) > 10:\n", | |
| " out.write(item)\n", | |
| " for user in range(0, currentuser):\n", | |
| " if user in owners:\n", | |
| " out.write('\\t1')\n", | |
| " else:\n", | |
| " out.write('\\t0')\n", | |
| " out.write('\\n')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 272 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### \uac70\ub9ac \uc9c0\ud45c \uacb0\uc815\n", | |
| "\n", | |
| "\ud53c\uc5b4\uc2a8 \uc0c1\uad00 \uc9c0\ud45c\ub294 \ub2e8\uc5b4 \ucd9c\ud604 \ud69f\uc218\uc778 \ube14\ub85c\uadf8 \ub370\uc774\ud130\uc5d0\ub294 \uc801\ud569\ud588\uc9c0\ub9cc, \uc81c\ubcf4 \ub370\uc774\ud130\ub294 1\uacfc 0\ub9cc \uc788\uc5b4 **\ud0c0\ub2c8\ubaa8\ud1a0 \uacc4\uc218(Tanimoto coefficient)**\uac00 \ub354 \uc720\uc6a9\ud558\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "$$T = \\frac{N_c}{N_a + N_b - N_c}$$\n", | |
| "\n", | |
| "$$N_x: \uc9d1\ud569 X\uc5d0 \uc788\ub294 \uc694\uc18c \uc218$$\n", | |
| "\n", | |
| "A = {car, train, aircraft, ship}\n", | |
| "B = {car, motorcycle, train}\n", | |
| "\uc774\ub77c\uba74 \n", | |
| "\n", | |
| "\uad50\uc9d1\ud569 C = {car, train} \uc774\ub2e4.\n", | |
| "\uc774 \uacbd\uc6b0 \ud0c0\ub2c8\ubaa8\ud1a0 \uacc4\uc218\ub294 2/(4+3-2) = 0.4\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def tanimoto(v1, v2):\n", | |
| " c1, c2, shr = 0, 0, 0\n", | |
| " for i in range(len(v1)):\n", | |
| " if v1[i] != 0:\n", | |
| " c1 += 1 # v1\uc5d0 \uc788\ub294 \uacbd\uc6b0\n", | |
| " if v2[i] != 0:\n", | |
| " c2 += 1 # v2\uc5d0 \uc788\ub294 \uacbd\uc6b0 \n", | |
| " if v1[i] != 0 and v2[i] != 0:\n", | |
| " shr += 1 # \ub458 \ub2e4\uc5d0 \uc788\ub294 \uacbd\uc6b0\n", | |
| " return 1.0 - (float(shr) / (c1+c2-shr))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 276 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc704 \ud568\uc218\ub294 1.0\uacfc 0.0 \uc0ac\uc774\uc758 \uac12\uc744 \ub9ac\ud134\ud55c\ub2e4. 1.0\uac12\uc740 \uccab \ubc88\uc9f8 \ud56d\ubaa9\uc744 \uc6d0\ud558\ub294 \uc5b4\ub5a4 \uc0ac\ub78c\ub3c4 \ub450 \ubc88\uc9f8 \ud56d\ubaa9\uc744 \uc6d0\ud558\uc9c0 \uc54a\uc74c\uc744 \ub73b\ud558\uace0, 0.0\uc740 \ub450 \ud56d\ubaa9\uc744 \uc6d0\ud558\ub294 \uc0ac\ub78c\uc774 \ub3d9\uc77c\ud558\ub2e4\ub294 \ub73b\uc774\ub2e4. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### \uacb0\uacfc \uad70\uc9d1\ud654\n", | |
| "\n", | |
| "\uc804\uc758 \ub370\uc774\ud130\uc640 \ud3ec\ub9f7\uc774 \uac19\uae30\uc5d0 \ub2e4\uc74c\uacfc \uac19\uc774 \ud074\ub7ec\uc2a4\ud130\ub97c \ub9cc\ub4e0\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "wants, people, data = readfile('zebo.txt')\n", | |
| "clust = hcluster(data, distance=tanimoto)\n", | |
| "drawdendrogram(clust, wants)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 277 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc18c\uc720\ubb3c\uc758 \uad70\uc9d1\uc744 \uadf8\ub9b0 clusters.png \ud30c\uc77c\uc774 \ub9cc\ub4e4\uc5b4\uc9c4\ub2e4.\n", | |
| "\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 8. 2\ucc28\uc6d0\uc73c\ub85c \ub370\uc774\ud130 \ubcf4\uae30\n", | |
| "\n", | |
| "\uac01 \ud56d\ubaa9\ub4e4\uc740 \ub450 \uac1c\ubcf4\ub2e4 \ub9ce\uc740 \uc22b\uc790\ub97c \uac00\uc9c0\uae30 \ub54c\ubb38\uc5d0 \uc788\ub294 \uadf8\ub300\ub85c 2\ucc28\uc6d0\uc73c\ub85c \ud45c\uc2dc\ud560 \uc218 \uc5c6\ub2e4. \uadf8\ub7fc\uc5d0\ub3c4 \ube44\uc2b7\ud55c \ud56d\ubaa9\uc744 \uac00\uae4c\uc6b4 \uac70\ub9ac\uc5d0\uc11c \ubcf4\uae30 \uc704\ud574 **\ub2e4\ucc28\uc6d0 \ube44\ub840 \ucd95\uc18c\ubc95(multidimensional scaling)**\uc774\ub780 \uae30\ubc95\uc744 \uc0ac\uc6a9\ud574 \ub370\uc774\ud130 \uc138\ud2b8\uc5d0 \ub300\ud55c 2\ucc28\uc6d0 \ud45c\ud604\uc744 \ucc3e\ub294\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\uc544\ub798\uc758 \ube14\ub85c\uadf8 \ub370\uc774\ud130 \uc138\ud2b8\uc5d0 \ub300\ud574 \uc124\uba85\ud558\uaca0\ub2e4.\n", | |
| "\n", | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ubaa8\ub4e0 \ud56d\ubaa9\uc744 \ub2e4\uc74c \uadf8\ub9bc\ucc98\ub7fc 2\ucc28\uc6d0 \ub3c4\ud45c\uc5d0 \uc784\uc758\ub85c \uc704\uce58\uc2dc\ud0a8\ub2e4.\n", | |
| "\n", | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ubaa8\ub4e0 \ud56d\ubaa9\uac04 \uac70\ub9ac\ub97c \uc2e4\uc81c\ub85c \uacc4\uc0b0\ud55c\ub2e4.\n", | |
| "\n", | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ubaa8\ub4e0 \ubaa9\ud45c \ud56d\ubaa9 \uc30d\uc5d0 \ub300\ud574 \ubaa9\ud45c \uac70\ub9ac\ub97c \ud604\uc7ac \uac70\ub9ac\uc5d0 \ube44\uad50\ud558\uace0 \uc624\ub958 \uac12\uc744 \uacc4\uc0b0\ud574 \uc880 \ub354 \uba40\uac70\ub098 \uac00\uae5d\uac8c \uc774\ub3d9\uc2dc\ud0a8\ub2e4.\n", | |
| "\n", | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ud56d\ubaa9\uc744 \uc6c0\uc9c1\uc5ec\ub3c4 \ub354 \uc774\uc0c1 \uc804\uccb4 \uc624\ub958 \uac12\uc774 \uc904\uc5b4\ub4e4\uc9c0 \uc54a\uc744 \ub54c\uae4c\uc9c0 \uc774 \uacfc\uc815\uc744 \ubc18\ubcf5 \uc218\ud589\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def scaledown(data, distance=pearson, rate=0.01):\n", | |
| " n = len(data)\n", | |
| " \n", | |
| " # \ubaa8\ub4e0 \ud56d\ubaa9 \uc30d \uac04\uc758 \uc2e4\uc81c \uac70\ub9ac\n", | |
| " realdist = [\n", | |
| " [distance(data[i], data[j]) for j in range(n)]\n", | |
| " for i in range(0, n)\n", | |
| " ]\n", | |
| "\n", | |
| " outersum = 0.0\n", | |
| " # 2D \ub0b4\uc5d0\uc11c \ubb34\uc791\uc704\ub85c \uc120\uc815\ub41c \uc704\uce58\uc5d0\uc11c \uc2dc\uc791\uc810\uc744 \ucd08\uae30\ud654\n", | |
| " loc = [[random.random(), random.random()] for i in range(n)]\n", | |
| " fakedist = [[0.0 for j in range(n)] for i in range(n)]\n", | |
| " \n", | |
| " lasterror = None\n", | |
| " for m in range(0, 1000):\n", | |
| " # \ud22c\uc601\ub41c \uac70\ub9ac\ub97c \uad6c\ud568\n", | |
| " for i in range(n):\n", | |
| " for j in range(n):\n", | |
| " fakedist[i][j] = sqrt(sum([pow(loc[i][x] - loc[j][x], 2)\n", | |
| " for x in range(len(loc[i]))]))\n", | |
| " \n", | |
| " # \uc810\uc744 \uc774\ub3d9\uc2dc\ud0b4 \n", | |
| " grad = [[0.0, 0.0] for i in range(n)]\n", | |
| " \n", | |
| " totalerror = 0\n", | |
| " for k in range(n):\n", | |
| " for j in range(n):\n", | |
| " if j == k:\n", | |
| " continue\n", | |
| " # \uc624\ub958\ub294 \uac70\ub9ac\uac04\uc758 \ucc28\uc774 \ube44\uc728\uc784\n", | |
| " errorterm = (fakedist[j][k] - realdist[j][k]) / realdist[j][k]\n", | |
| " \n", | |
| " # \uac01 \uc810\uc744 \uc624\ub958 \uc815\ub3c4\uc5d0 \ube44\ub840\ud574\uc11c \ub2e4\ub978 \uc810 \uadfc\ucc98\ub098 \uba40\ub9ac \uc774\ub3d9\uc2dc\ud0b4\n", | |
| " grad[k][0] += ((loc[k][0] - loc[j][0]) / fakedist[j][k]) * errorterm\n", | |
| " grad[k][1] += ((loc[k][1] - loc[j][1]) / fakedist[j][k]) * errorterm\n", | |
| " \n", | |
| " # \uc804\uccb4 \uc624\ub958\ub97c \uae30\ub85d\ud568\n", | |
| " totalerror += abs(errorterm)\n", | |
| " print totalerror\n", | |
| " \n", | |
| " # \uc810\ub4e4\uc744 \uc6c0\uc9c1\uc5ec \uc5bb\ub294 \uacb0\uacfc\uac00 \ub354 \ub098\uc058\uba74 \uc791\uc5c5\uc744 \ub9c8\uce68\n", | |
| " if lasterror and lasterror < totalerror:\n", | |
| " break\n", | |
| " lasterror = totalerror\n", | |
| " \n", | |
| " # \uac01 \uc810\ub4e4\uc744 \ud559\uc2b5 \ube44\uc728\uacfc \uae30\uc6b8\uae30\ub97c \uacf1\ud55c \ub9cc\ud07c \uc774\ub3d9\uc2dc\ud0b4\n", | |
| " for k in range(n):\n", | |
| " loc[k][0] -= rate * grad[k][0]\n", | |
| " loc[k][1] -= rate * grad[k][1]\n", | |
| " \n", | |
| " return loc" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 278 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "\ubaa8\ub4e0 \ud56d\ubaa9\ub4e4\uc758 \ub77c\ubca8\uc744 \uc0c8\ub85c\uc6b4 \uc88c\ud45c\ucd95\uc5d0 \ucd9c\ub825\ud55c\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def draw2d(data, labels, png='mds2d.png'):\n", | |
| " img = Image.new('RGB', (2000, 2000), (255, 255, 255))\n", | |
| " draw = ImageDraw.Draw(img)\n", | |
| " \n", | |
| " for i in range(len(data)):\n", | |
| " x = (data[i][0] + 0.5) * 1000\n", | |
| " y = (data[i][1] + 0.5) * 1000\n", | |
| " draw.text((x, y), labels[i], (0, 0, 0))\n", | |
| " img.save(png, 'PNG')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 279 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "blognames, words, data = readfile('blogdata.txt')\n", | |
| "coords = scaledown(data)\n", | |
| "draw2d(coords, blognames, png='blogs2d.png')\n", | |
| "print 'done'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "2857.89503513\n", | |
| "2467.4194961\n", | |
| "2361.71948878" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2286.87684804\n", | |
| "2240.27524819" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2213.28747021\n", | |
| "2195.0436976" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2178.50761854\n", | |
| "2163.27530791\n", | |
| "2149.64276127" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2137.4891203\n", | |
| "2125.64380253" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2113.18223353\n", | |
| "2100.58451737" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2089.41330535\n", | |
| "2079.72847107" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2071.3988206\n", | |
| "2063.38371947" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2055.3060332\n", | |
| "2047.57593728" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2041.25437359\n", | |
| "2036.27966372" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2031.95862917\n", | |
| "2027.77447526" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2023.55818789\n", | |
| "2020.06484967\n", | |
| "2017.28163718" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2014.74638522\n", | |
| "2011.9784254" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2009.54384802\n", | |
| "2007.30085516" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2005.119494\n", | |
| "2002.87891735" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "2000.64803149\n", | |
| "1998.30455293" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1996.12050712\n", | |
| "1993.93267799" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1992.9942433\n", | |
| "1992.34255567" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1991.544295\n", | |
| "1990.55414056" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1989.65106322\n", | |
| "1988.92480247" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1988.44375919\n", | |
| "1987.81056395" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1987.08541738\n", | |
| "1986.50665456" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1986.01105077\n", | |
| "1985.47126539" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1984.89157972\n", | |
| "1984.21937445" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1983.4561439\n", | |
| "1982.60911239" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1981.89411159\n", | |
| "1981.15422374" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1980.34232238\n", | |
| "1979.42577947" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1978.56652532\n", | |
| "1977.66273986" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1976.77654083\n", | |
| "1975.84753466" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1974.95447014\n", | |
| "1974.0755233" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1973.16281054\n", | |
| "1972.32539928\n", | |
| "1971.48068698" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1970.69457281\n", | |
| "1969.88255191\n", | |
| "1969.16668652" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1968.46550507\n", | |
| "1967.83128821\n", | |
| "1967.22094156" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1966.5992298\n", | |
| "1965.99030906\n", | |
| "1965.43845845" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1964.85815937\n", | |
| "1964.31689552\n", | |
| "1963.84681173" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1963.37813983\n", | |
| "1962.9111184\n", | |
| "1962.49251582" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1962.08834192\n", | |
| "1961.67438201\n", | |
| "1961.23275404" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1960.78995279\n", | |
| "1960.35287024\n", | |
| "1959.82401791" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1959.26563613\n", | |
| "1958.71271735\n", | |
| "1958.21073843" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1957.81021165\n", | |
| "1957.3970146\n", | |
| "1957.00955049" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1956.58983878\n", | |
| "1956.14566151" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1955.6449922\n", | |
| "1955.12239848" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1954.5529858\n", | |
| "1953.96357159" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1953.38178076\n", | |
| "1952.77906619" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1952.21134154\n", | |
| "1951.61363964" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1951.08323215\n", | |
| "1950.61999134" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1950.21102485\n", | |
| "1949.85188292" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1949.46168152\n", | |
| "1949.13420183\n", | |
| "1948.88705294" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1948.66259403\n", | |
| "1948.45266707\n", | |
| "1948.34992603" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1948.24158753\n", | |
| "1948.1253183\n", | |
| "1947.946409" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1947.81262743\n", | |
| "1947.69033922\n", | |
| "1947.57600592" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1947.47176437\n", | |
| "1947.38761444" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1947.28116704\n", | |
| "1947.21896041" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1947.21333705\n", | |
| "1947.20047871" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1947.14425487\n", | |
| "1947.05273768" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1946.96501656\n", | |
| "1946.84983984" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1946.68562219\n", | |
| "1946.52436842" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1946.32869179\n", | |
| "1946.16206297" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1946.02792532\n", | |
| "1945.90720586\n", | |
| "1945.77516806" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1945.63787895\n", | |
| "1945.51656778\n", | |
| "1945.40440734" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1945.32310684\n", | |
| "1945.22839997" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1945.12422362\n", | |
| "1944.99527035" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1944.86840879\n", | |
| "1944.77437903" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1944.67802757\n", | |
| "1944.56532166" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1944.45131693\n", | |
| "1944.3108529\n", | |
| "1944.15104759" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1943.95421089\n", | |
| "1943.73340744\n", | |
| "1943.4775277" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1943.18166272\n", | |
| "1942.88538775" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1942.59372777\n", | |
| "1942.33796588" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1942.12981243\n", | |
| "1941.96789559" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.82553715\n", | |
| "1941.69447915" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.55852644\n", | |
| "1941.43677921" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.33530297\n", | |
| "1941.24521478" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.23275451\n", | |
| "1941.23260892" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.18982466\n", | |
| "1941.10966054" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1941.02623394\n", | |
| "1940.89798604\n", | |
| "1940.75832551" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1940.63159843\n", | |
| "1940.47096097" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1940.26151914\n", | |
| "1940.03081415\n", | |
| "1939.8337859" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1939.68841333\n", | |
| "1939.53581351\n", | |
| "1939.39539245" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1939.25287752\n", | |
| "1939.09603152" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1938.90155656\n", | |
| "1938.69030925" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1938.46353798\n", | |
| "1938.22972531" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1938.05285682\n", | |
| "1937.8679066\n", | |
| "1937.67229354" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1937.50157662\n", | |
| "1937.31817722\n", | |
| "1937.0933292" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1936.81827843\n", | |
| "1936.53362994\n", | |
| "1936.23324502" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1935.87530705\n", | |
| "1935.53460447\n", | |
| "1935.33978587" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1935.17308651\n", | |
| "1935.03504364\n", | |
| "1934.96677911" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.93960007\n", | |
| "1934.84181262" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.75443404\n", | |
| "1934.72474214\n", | |
| "1934.65842789" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.59656996\n", | |
| "1934.5111126" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.45064129\n", | |
| "1934.3784351" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.33480792\n", | |
| "1934.27533822" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.21483406\n", | |
| "1934.17326429" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1934.17642933\n", | |
| "done" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 280 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 9. \uad70\uc9d1 \uac00\ub2a5\ud55c \ub2e4\ub978 \uac83\ub4e4\n", | |
| "\n", | |
| "\ub2e4\uc591\ud55c \uc8fc\uc81c\uc5d0 \ub300\ud574 \uad70\uc9d1 \uac00\ub2a5\ud55c \uac83\ub4e4\uc740 \uc544\uc8fc \ub9ce\ub2e4. \n", | |
| "\n", | |
| "\ub2e4\ucc28\uc6d0 \ube44\uc728 \ucd95\uc18c\ubc95\uc740 \ub370\uc774\ud130 \uc138\ud2b8\ub97c \uc27d\uac8c \ud574\uc11d \uac00\ub2a5\ud55c \ubc29\uc2dd\uc73c\ub85c \uad00\ucc30\ud560 \uc218 \uc788\ub294 \ud6a8\uacfc\uc801\uc778 \ubc29\ubc95\uc774\ub098 \ucd95\uc18c \uacfc\uc815\uc5d0\uc11c \uc815\ubcf4\uac00 \uc190\uc2e4\ub420 \uc218\ub3c4 \uc788\ub2e4." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 10. \ud568\uaed8 \ud480\uc5b4\ubcf4\uae30\n", | |
| "\n", | |
| "### 4. \ub9e8\ud574\ud2bc \uac70\ub9ac:\n", | |
| "\n", | |
| "http://ko.wikipedia.org/wiki/\ub9e8\ud574\ud2bc_\uac70\ub9ac\n", | |
| "\n", | |
| "$$d_1: \uac70\ub9ac$$\n", | |
| "$$n \ucc28\uc6d0$$\n", | |
| "$$d_1(P, Q) = \\sum\\limits_{i=1}^n |P_i - Q_i|$$" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def manhattan(v1, v2):\n", | |
| "    \"\"\"Return the Manhattan (L1) distance between two vectors.\n", | |
| "\n", | |
| "    Adds up abs(v1[i] - v2[i]) for every index i of v1. Smaller values mean\n", | |
| "    the vectors are closer; identical vectors give 0.0.\n", | |
| "\n", | |
| "    NOTE(review): hcluster is defined outside this cell; this assumes it\n", | |
| "    merges the pair with the smallest distance value first - confirm.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        v1: sequence of numbers; its length sets how many positions are compared.\n", | |
| "        v2: sequence of numbers, at least as long as v1.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        float: the summed absolute difference.\n", | |
| "\n", | |
| "    Raises:\n", | |
| "        IndexError: if v2 is a list shorter than v1.\n", | |
| "    \"\"\"\n", | |
| "    total = 0.0\n", | |
| "    for i in range(len(v1)):\n", | |
| "        total += abs(v1[i] - v2[i])\n", | |
| "    # Return the distance itself, not 1.0 / (1 + total): that reciprocal is a\n", | |
| "    # similarity score and reverses the ordering a distance function needs.\n", | |
| "    return total" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 193 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Exercise 4: rebuild the hierarchical clustering using manhattan as the distance function.\n", | |
| "# readfile, hcluster and drawdendrogram are defined outside this cell.\n", | |
| "wants, people, data = readfile('zebo.txt')\n", | |
| "clust = hcluster(data, distance=manhattan)\n", | |
| "# Presumably writes the dendrogram image to dendro-manhattan.png - confirm against drawdendrogram.\n", | |
| "drawdendrogram(clust, wants, png='dendro-manhattan.png')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 198 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "" | |
| ] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment