Created
October 7, 2014 12:15
-
-
Save d2207197/31c0fe2a2883a7605a38 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:68c70d4a91e4c81305739f17d93693072d5c58d7117de4a64b6f11c53e131359" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import division\n", | |
"\n", | |
"# Tunable parameters for the collocation filter. The names mirror k_0, k_1 and u_0 in\n", | |
"# conditions C1-C3 of the Smadja markdown cell later in this notebook; the filtering\n", | |
"# code itself is not part of this cell, so confirm the mapping there.\n", | |
"\n", | |
"# C1: keep a collocate when strength = (freq - avg_freq) / dev >= k0.\n", | |
"k0 = 1\n", | |
"# C3: keep a distance when its count >= mean count + k1 * sqrt(spread).\n", | |
"k1 = 1\n", | |
"# C2: presumably the lower bound u_0 on spread -- TODO confirm against the filter code.\n", | |
"U0 = 10\n", | |
"# Presumably the largest |distance| between base word and collocate that is counted\n", | |
"# (stored distances elsewhere in this notebook range over -5..5) -- TODO confirm.\n", | |
"max_distance = 5" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# \u8a08\u7b97 skip bigram \n", | |
"\n", | |
"- $p_j^i$\uff1a\u7b2c $i$ \u500b collocate \u5728\u8ddd\u96e2 $j$ \u7684\u51fa\u73fe\u6b21\u6578\n", | |
"- ngram \u4efb\u4f55\u4f4d\u7f6e\u4e0d\u5305\u542b\u7b26\u865f\n", | |
"- skip bigram \u4e0d\u5305\u542b stop words \u8207\u6578\u5b57" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### \u968e\u6bb5\u6aa2\u67e5\u8cc7\u6599" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"skip_bigram_info['role']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"BaseWord(avg_freq = 0, dev = 0, \n", | |
" { 'limited': Collocates(freq = 0, spread = 0, {3: 1}), \n", | |
" 'concept': Collocates(freq = 0, spread = 0, {-5: 1, -2: 1, 1: 1, 4: 1}), \n", | |
" 'semantic': Collocates(freq = 0, spread = 0, {-1: 1, 2: 1}), ...}\n", | |
")" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"skip_bigram_info['play']['role']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"Collocates(freq = 0, spread = 0, {-5: 1, -4: 2, -2: 2, 2: 8, 3: 51, 4: 5, 5: 1})" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# \u8a08\u7b97 skip bigram \u7684\u5404\u7a2e\u7d71\u8a08\u8cc7\u8a0a \n", | |
"$freq,\\ \\bar{f},\\ \\sigma,\\ u_i,\\ \\bar{p_i}$" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### \u968e\u6bb5\u6aa2\u67e5\u8cc7\u6599\n", | |
"- $\\bar{f}$ : `avg_freq`\n", | |
"- $\\sigma$ : `dev`\n", | |
"- $u_i$ : `spread`\n", | |
"- $freq$ : `freq`\n", | |
"- $\\bar{p_i}$ : $p_j^i$ \u5728\u5404\u8ddd\u96e2\u7684\u5e73\u5747\uff0c\u5373 `freq / (2 * max_distance)`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"skip_bigram_info['play']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 38, | |
"text": [ | |
"BaseWord(avg_freq = 1.52475247525, dev = 4.1971089989, \n", | |
" { 'xed': Collocates(freq = 1, spread = 0.09, {-5: 1}), \n", | |
" 'What': Collocates(freq = 1, spread = 0.09, {-5: 1}), \n", | |
" 'Interconnect': Collocates(freq = 1, spread = 0.09, {-2: 1}), ...}\n", | |
")" | |
] | |
} | |
], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"skip_bigram_info['play']['role']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 39, | |
"text": [ | |
"Collocates(freq = 70, spread = 221.0, {-5: 1, -4: 2, -2: 2, 2: 8, 3: 51, 4: 5, 5: 1})" | |
] | |
} | |
], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## \u4f7f\u7528 pandas \u756b\u51fa play role \u7684\u5404\u8ddd\u96e2\u6578\u91cf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas\n", | |
"%matplotlib inline\n", | |
"\n", | |
"# Distance -> count mapping of the collocate 'role' for the base word 'play'.\n", | |
"play_role = skip_bigram_info['play']['role']\n", | |
"\n", | |
"# A bar chart has a categorical x axis, so a distance with no occurrences would simply\n", | |
"# disappear and make non-adjacent distances look adjacent. Give every non-zero distance\n", | |
"# in the window its own slot; the window is widened if the data reaches further than\n", | |
"# max_distance, so no observed count is ever dropped by the reindex.\n", | |
"window = max([max_distance] + [abs(d) for d in play_role.keys()])\n", | |
"all_distances = [d for d in range(-window, window + 1) if d != 0]\n", | |
"\n", | |
"play_role_distances_count = (\n", | |
"    pandas.Series(list(play_role.values()), index=list(play_role.keys()))\n", | |
"    .reindex(all_distances, fill_value=0)\n", | |
")\n", | |
"\n", | |
"ax = play_role_distances_count.plot(kind='bar', title='play / role: count per distance')\n", | |
"ax.set_xlabel('distance')\n", | |
"# Trailing semicolon keeps the matplotlib object repr out of the cell output.\n", | |
"ax.set_ylabel('count');" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 42, | |
"text": [ | |
"<matplotlib.axes.AxesSubplot at 0x117f2ae10>" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "display_data", | |
"png": "iVBORw0KGgoAAAANSUhEUgAAAWwAAAD/CAYAAADVGuzgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEXVJREFUeJzt3W2MXGd5xvH/4k1EnHgzXoXaVgLZKlIElYANKi99UxYU\nUCqBa0GbL62UpSlfWlEqBLUjlbpSWzBWJayqLxKtyhqEWlxo0/hDW5swQ1ukGlJiQ0lSt062SqrY\nhtgmJiBBm+mH56z3ZNkdn5kzZ59z7/5/0mjnOT4zc+1m9t6Za85MQJIkSZIkSZIkSZIkSZIkBdMB\nPgs8BjwKvBGYBo4Dp4FjxT6SpMwOA79cnJ8EbgQOAr9ZbNsLHMiQS5JUciPwxCrbHwd2FOd3FmtJ\nUkazwAngE8BXgT8DrgculvaZWLGWJI3ZSyrsMwm8DviT4uvzwL4V+/SLkySpIZMV9nm6OH2lWH8W\nuB84S6pCzgK7gPMrL3jbbbf1z5w5M56kkrR5nCK1Gy9S5RH2WeAp4PZifRfwDeAocG+x7V7ggZUX\nPHPmDP1+v7HT/v37G73+pk+R80fObv78J/MPPgGvXW0YV3mEDfBe4NPAtcAZ4N3AFuAIcB+wCNxT\n8brGZnFxcb1vcqwi54+cHcyfm/lHU3VgnwJev8r2u8aYRZI0QJVKpLXm5+dzR6glcv7I2cH8uZl/\nNBMNX3+/6GMkSRVNTEzAKvM59CPsXq+XO0ItkfNHzg7mz838owk9sCVpM7ESkaSW2ZCViCRtJqEH\ntj1YPpGzg/lzM/9oQg9sSdpM7LAlqWXssCUpuNAD2x4sn8jZwfy5mX80oQe2JG0mdtiS1DJ22JIU\nXOiBbQ+WT+TsYP7czD+a0ANbkjYTO2xJahk7bEkKLvTAtgfLJ3J2MH9u5h9N6IEtSZuJHbYktYwd\ntiQFF3pg24PlEzk7mD83848m9MCWpM3EDluSWsYOW5KCCz2w7cHyiZwdzJ+b+UcTemBL0mZStcNe\nBJ4D/g/4AfAGYBr4DHBr8e/3AJdWXM4OW5KGVLfD7gNzwB2kYQ2wDzgO3A48VKwljWBqapqJiYnG\nTlNT07m/RY3BMJXIymm/GzhcnD8M7BlLoiHYg+UTOTu0L//lyxdJj4uqnrpD7Z+uvz3a9vMfVts7\n7D7weeBh4D3Fth3AueL8uWItSWpI1Q57F/AM8DJSDfJe4EFge2mfC6Reu8wOW6ogdZZN/q5M4O9i\nHGt12JMVL/9M8fWbwN+SeuxzwE7gLGmgn1/tgvPz88zMzADQ6XSYnZ1lbm4OWH5a4dq1a4Be8bWZ\nde7vz/Xa616vx8LCAsCVeTmqrcC24vz1wJeAtwEHgb3F9n3AgVUu229St9tt9PqbFjl/5Oz9fvvy\nA33oD3HqDrl/s7+Lw2rbz39YTednjadbVR5h7yA9ql7a/9PAMVKffQS4j+XD+iRJDfGzRKQWsMNW\nmZ8lIknBhR7YS6V9VJHzR84O8fMvv6AYU/Sff678oQe2JG0mdthSC9hhq8wOW5KCCz2w7cHyiZwd\n4ue3w87LDluSNJAdttQCdtgqs8OWpOBCD2x7sHwiZ4f4+e2w87LDliQNZIcttYAdtsrssCUpuNAD\n2x4sn8jZIX5+O+y87LAlSQPZYUstYIetMjtsSQou9MC2B8sncnaIn98OOy87bEnSQHbYUgvYYavM\nDluSggs9sO3B8omcHeLnt8POyw5bkjSQHbbUAnbYKrPDlqTgQg9se7B8ImeH+PntsPOyw5YkDWSH\nLbWAHbbK7LAlKbiqA3sL8AhwtFhPA8eB08AxoDP+aFdnD5ZP5OwQP78ddl5t77DfBzzK8nO2faSB\nfTvwULGWJDWoSod9C7AA/D7wfuAdwOPAncA5YCfpz/0rV7msHbZUgR22yup02B8DPgi8UNq2gzSs\nKb7uqJlPknQVk1f597cD50n99dwa+/QZ8NBgfn6emZ
kZADqdDrOzs8zNpata6oFGXR86dGis17fe\n68j5yx1eG/JshPzLvXSV9XL+avvn//7a/vPPmb/X67GwsABwZV6O4sPAU8CTwDPA88CnSJXIzmKf\nXcV6Nf0mdbvdRq+/aZHzR87e77cvP9CH/hCn7pD7N/u7OKy2/fyH1XR+1ngQPMxx2HcCHyB12AeB\nZ4GPkl5w7LD6C4/FbUsaxA5bZeM6Dnvpv/gB4K2kw/reUqwlSQ0aZmB/EdhdnL8A3EU6rO9twKUx\n56qk3CNFFDl/5OwQP7/HYeeVK7/vdJSkIPwsEakF7LBV5meJSFJwoQe2PVg+kbND/Px22HnZYUuS\nBrLDllrADltldtiSFFzogW0Plk/k7BA/vx12XnbYkqSB7LClFrDDVpkdtiQFF3pg24PlEzk7xM9v\nh52XHbYkaSA7bKkF7LBVZoctScGFHtj2YPlEzg7x89th52WHLUkayA5bagE7bJXZYUtScKEHtj1Y\nPpGzQ/z8dth52WFLkgayw5ZawA5bZXbYkhRc6IFtD5ZP5OwQP78ddl522JKkgeywpRaww1aZHbYk\nBRd6YNuD5RM5O8TPb4edV1s77JcCJ4CTwKPAR4rt08Bx4DRwDOg0FVCSlFTpsLcC3wUmgX8BPgDs\nBr4FHAT2AtuBfatc1g5bqsAOW2V1OuzvFl+vBbYAF0kD+3Cx/TCwp35ESdIgVQb2S0iVyDmgC3wD\n2FGsKb7uaCTdVdiD5RM5O8TPb4edV678kxX2eQGYBW4E/hF484p/7zPgudz8/DwzMzMAdDodZmdn\nmZubA5a/6VHXJ0+erHX53Ovo+V2Pd708hJtZ5/7+XK+97vV6LCwsAFyZl6sZ9jjsDwHfA36FdC84\nC+wiPfJ+5Sr722FLFdhhq2zUDvsmlo8AuQ54K/AI8CBwb7H9XuCBsaSUJK3pagN7F/AFUod9AjgK\nPAQcIA3v08BbivW6W3pKEVXk/JGzQ/z8dth55cp/tQ7768DrVtl+Abhr/HEkSWvxs0SkFrDDVpmf\nJSJJwYUe2PZg+UTODvHz22HnlSt/6IEtSZuJHbbUAnbYKrPDlqTgQg9se7B8ImeH+PntsPOyw5Yk\nDWSHLbWAHbbK7LAlKbjQA9seLJ/I2SF+fjvsvOywJUkD2WFLLWCHrTI7bEkKLvTAtgfLJ3J2iJ/f\nDjsvO2xJ0kB22FIL2GGrzA5bkoILPbDtwfKJnB3i57fDzssOW5I0kB221AJ22Cqzw5ak4EIPbHuw\nfCJnh/j57bDzssOWJA1khy21gB22yuywJSm40APbHiyfyNkhfn477LzssCVJA1XpsF8OfBL4EVLJ\n9nHgD4Fp4DPArcAicA9wacVl7bClCuywVbZWh11lYO8sTieBG4B/A/YA7wa+BRwE9gLbgX0rLuvA\nlipwYKuszouOZ0nDGuA7wGPAzcBu4HCx/TBpiK8re7B8ImeH+PntsPOK0mHPAHcAJ4AdwLli+7li\nLUlqyDDHYd8AfBH4XeAB4CKpBllygdRrl1mJSBVYiahsrUpksuLlrwE+B3yKNKwhPareSapMdgHn\nV7vg/Pw8MzMzAHQ6HWZnZ5mbmwOWn1a4du0almuOZta5vz/Xa697vR4LCwsAV+blqCZIR4l8bMX2\npRcbIb3YeGCVy/ab1O12G73+pkXOHzl7v9++/EAf+kOcukPu3+zv4rDa9vMfVtP5WePpVpVH2D8F\n/BLwNeCRYtv9xYA+AtzH8mF9kqSG+FkiUgvYYavMzxKRpOBCD+yl0j6qyPkjZ4f4+T0OO69c+UMP\nbEnaTOywpRaww1aZHbYkBRd6YNuD5RM5O8TPb4edlx22JGkgO2ypBeywVWaHLUnBhR7Y9mD5RM4O\n8fPbYedlhy1JGsgOW2oBO2yV2WFLUnChB7Y9WD6Rs0P8/HbYedlhS5IGssOWWsAOW2V22JIUXOiB\nbQ+WT+TsED+/HX
ZedtiSpIHssKUWsMNWmR22JAUXemDbg+UTOTvEz2+HnZcdtiRpIDtsqQXssFVm\nhy1JwYUe2PZg+UTODvHz22HnZYctSRrIDltqATtsldXpsP8COAd8vbRtGjgOnAaOAZ36ESVJg1QZ\n2J8A7l6xbR9pYN8OPFSs1509WD6Rs0P8/HbYebW5w/5n4OKKbbuBw8X5w8CecYaSJP2wqh32DHAU\neHWxvghsL13HhdK6zA5bqsAOW2VNHofdp9l7miQJmBzxcueAncBZYBdwfq0d5+fnmZmZAaDT6TA7\nO8vc3Byw3AONuj506NBYr2+915Hzlzu8NuTZCPmXe+kq6+X81fbP//21/eefM3+v12NhYQHgyrys\nY4YXHyVyENhbnN8HHFjjcv0mdbvdRq+/aZHzR87e77cvP9CH/hCn7pD7N/u7OKy2/fyH1XR+1mgt\nqnTYfwncCdxEemT928DfAUeAVwCLwD3ApTUGdoWbkDY3O2yVrdVh+8YZqQUc2CrbkB/+VO6RIoqc\nP3J2iJ/f47DzypU/9MCWpM3ESkRqASsRlW3ISkRaMjU1zcTERGOnqanp3N+iFHtg24Pl07bsly9f\nZPk9XFVO3aH2T9ffJr3cAWpp2/1nWHbYkqSB7LC1IUTvgKPn13jZYUtScKEHtj1YPpGzJ73cAWrq\n5Q5QS/T7jx22JGkgO2xtCNE74Oj5NV522JIUXOiBbQ+WT+TsSS93gJp6uQPUEv3+Y4ctSRrIDlsb\nQvQOOHr+qanpRt8Num3bdp577kJj1982fh62NrToA8/8V72FTfWi6YZ80dEeLJ/I2ZNe7gA19XIH\nqKmXO0AtdtiSpIGsRLQhRH9Kbv6r3oKVCD7ClqQwQg/s6D1q5PyRsye93AFq6uUOUFMvd4Ba7LAl\nSQPZYWtDiN6hmv+qt2CHjY+wJSmM0AM7eo8aOX/k7Ekvd4CaerkD1NTLHaCWXPf/ySy3ukE1+fbc\npt+a61uLpfazwx6jZns8O8iB127+q92C+QOxw5ak4OoO7LuBx4H/BPbWjzMce9ScerkD1NTLHaCm\nXu4ANfVyB6glYoe9Bfgj4C7gf4CvAA8Cj416hZuvRz0JzOUOMaLI2cH8ubUrf5TZU+cR9huA/wIW\ngR8AfwX8XJ0w6QfWH+K0f6j9m/wPMppLuQPUEDk7mD+3duWPMnvqDOybgadK66eLbZKkBtQZ2C14\nyXYxd4CaFnMHqGExd4CaFnMHqGkxd4CaFnMHqGkxy63WOazvTcDvkF54BLgfeAH4aGmfk8Bra9yG\nJG1Gp4DZcV7hJHAGmAGuJQ3nV43zBiRJ4/OzwH+QXny8P3MWSZIkKb8tuQNI6+BVpNdSvgl8v7T9\nbtKzw7b7aeAGUv454OeB64AnM2barH4G+AVgG6kSVkXTuQPUtDt3gE3i10m13QPAfwN7Sv/2SJZE\nw/kI8K+kN6YdLM5/CPgn4IMZc9XxydwBhvDl0vn3kF6r2w98CWvgNf1W6fyPAadJjy4WSUertN07\ngXcVp6Xz54rz78yYq4rXkIbE08DHge2lf/vyqpdol38nPTqF9AL5w8BvFOsIA/tR0gv8W4HLwI3F\n9uuAr+UKNYSjpHdAHy2dni9tb7vyfeRh4GXF+etJ9611FeXjVd8F/F5x/g+A9wF/T3q35SHgJzPl\nquoI8A+kp7SQDqfcCryjWP9NjlAV/Snp8M0TwH2kRxa7SVXCNfliVTYBfKc4v0iqFD4H3Erzn1Y5\nDt8H/rc4nQG+XWz/Hukw2ra7hfRH589JeSeAHyf9HkewhfRsfqI4v/Q7/Dzpv4lWUf4rd2rFv51c\nzyAjej3wBeBXWR4SUfrHlY/i3kwa1m8ixiPULj98POs1pKflEQbeCdIfd3jxG906wFfXP87QtgDv\nBz4P3FFsi3Lfh/RH/sni9ASwq9i+jRizJ4tvs/y06lmW78ATZHhaMqItpKfiXeCN
xLnTnmL5afiS\n15CG9rPrH2doLwd2rrJ9gvRiXtu9dI3tNwGvXs8gNd0C/DXwx7z4Iy2i2gr8aO4QbTVXOt1J+uu2\nE9gB/FquUCO6mXTHfSJ3kIp+EfiJFdt2Aq8gPc2VhvF24MO5Q2j9RXg6OEiEOmEt0X/2UkiR/48z\nEV4w2qj82UsZRH7jzBbSsalRRc4fObskSZIkSZIkSZIkSZIktdj/A5VzOUfc7nkxAAAAAElFTkSu\nQmCC\n", | |
"text": [ | |
"<matplotlib.figure.Figure at 0x117f22790>" | |
] | |
} | |
], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# \u4f9d\u64da Smadja\u2019s \u4e09\u500b\u898f\u5247\u7be9\u9078 skip bigrams\n", | |
"\n", | |
"$$\\begin{cases} \n", | |
"strength = \\frac{freq - \\bar{f}}{\\sigma} \\ge k_0 & (C_1)\\\\\n", | |
"spread \\ge u_0 & (C_2) \\\\\n", | |
"p_j^i \\ge \\bar{p_i} + (k_1 \\times \\sqrt{u_i}) & (C_3) \n", | |
"\\end{cases}$$" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## \u4f7f\u7528 pandas Dataframe \u6574\u7406\u8207\u986f\u793a\u8cc7\u6599" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### \u5217\u51fa collocation Dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"collocations_df" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>strength</th>\n", | |
" <th>spread</th>\n", | |
" <th>peak</th>\n", | |
" <th>p</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>base word</th>\n", | |
" <th>collocate</th>\n", | |
" <th>distance</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"30\" valign=\"top\">#</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">#</th>\n", | |
" <th>-2</th>\n", | |
" <td> 37.990803</td>\n", | |
" <td> 506.56</td>\n", | |
" <td> 87.706888</td>\n", | |
" <td> 107</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 2</th>\n", | |
" <td> 37.990803</td>\n", | |
" <td> 506.56</td>\n", | |
" <td> 87.706888</td>\n", | |
" <td> 107</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">The</th>\n", | |
" <th>-3</th>\n", | |
" <td> 2.035968</td>\n", | |
" <td> 13.16</td>\n", | |
" <td> 7.427671</td>\n", | |
" <td> 10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>-2</th>\n", | |
" <td> 2.035968</td>\n", | |
" <td> 13.16</td>\n", | |
" <td> 7.427671</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">We</th>\n", | |
" <th>-5</th>\n", | |
" <td> 1.333268</td>\n", | |
" <td> 20.04</td>\n", | |
" <td> 7.076606</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>-4</th>\n", | |
" <td> 1.333268</td>\n", | |
" <td> 20.04</td>\n", | |
" <td> 7.076606</td>\n", | |
" <td> 14</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>c</th>\n", | |
" <th> 1</th>\n", | |
" <td> 2.153085</td>\n", | |
" <td> 114.20</td>\n", | |
" <td> 14.686440</td>\n", | |
" <td> 36</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ciency</th>\n", | |
" <th> 1</th>\n", | |
" <td> 2.738668</td>\n", | |
" <td> 178.60</td>\n", | |
" <td> 18.364131</td>\n", | |
" <td> 45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>cient</th>\n", | |
" <th> 1</th>\n", | |
" <td> 8.828738</td>\n", | |
" <td> 2043.44</td>\n", | |
" <td> 60.604425</td>\n", | |
" <td> 151</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ciently</th>\n", | |
" <th> 1</th>\n", | |
" <td> 2.153085</td>\n", | |
" <td> 128.60</td>\n", | |
" <td> 15.340194</td>\n", | |
" <td> 38</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>coe</th>\n", | |
" <th>-1</th>\n", | |
" <td> 1.216151</td>\n", | |
" <td> 43.04</td>\n", | |
" <td> 8.960488</td>\n", | |
" <td> 22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>cult</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.860293</td>\n", | |
" <td> 103.45</td>\n", | |
" <td> 13.671037</td>\n", | |
" <td> 34</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>di</th>\n", | |
" <th>-1</th>\n", | |
" <td> 20.716086</td>\n", | |
" <td> 9893.61</td>\n", | |
" <td> 135.166628</td>\n", | |
" <td> 334</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>e</th>\n", | |
" <th>-1</th>\n", | |
" <td> 24.053912</td>\n", | |
" <td> 11877.84</td>\n", | |
" <td> 150.385504</td>\n", | |
" <td> 368</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ect</th>\n", | |
" <th> 1</th>\n", | |
" <td> 2.035968</td>\n", | |
" <td> 115.36</td>\n", | |
" <td> 14.540577</td>\n", | |
" <td> 36</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ective</th>\n", | |
" <th> 1</th>\n", | |
" <td> 3.148577</td>\n", | |
" <td> 270.21</td>\n", | |
" <td> 22.138066</td>\n", | |
" <td> 55</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ectively</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.157593</td>\n", | |
" <td> 47.61</td>\n", | |
" <td> 9.200000</td>\n", | |
" <td> 23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ects</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.977410</td>\n", | |
" <td> 109.21</td>\n", | |
" <td> 14.150359</td>\n", | |
" <td> 35</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>er</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.860293</td>\n", | |
" <td> 96.85</td>\n", | |
" <td> 13.341240</td>\n", | |
" <td> 33</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>erent</th>\n", | |
" <th> 1</th>\n", | |
" <td> 12.459357</td>\n", | |
" <td> 3617.24</td>\n", | |
" <td> 81.743495</td>\n", | |
" <td> 202</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>erential</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.450384</td>\n", | |
" <td> 59.96</td>\n", | |
" <td> 10.543384</td>\n", | |
" <td> 26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ers</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.450384</td>\n", | |
" <td> 59.96</td>\n", | |
" <td> 10.543384</td>\n", | |
" <td> 26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">n</th>\n", | |
" <th>-1</th>\n", | |
" <td> 2.504435</td>\n", | |
" <td> 24.44</td>\n", | |
" <td> 9.543683</td>\n", | |
" <td> 16</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 1</th>\n", | |
" <td> 2.504435</td>\n", | |
" <td> 24.44</td>\n", | |
" <td> 9.543683</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <th>-1</th>\n", | |
" <td> 4.026952</td>\n", | |
" <td> 384.56</td>\n", | |
" <td> 26.810201</td>\n", | |
" <td> 66</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>su</th>\n", | |
" <th>-1</th>\n", | |
" <td> 2.855785</td>\n", | |
" <td> 223.16</td>\n", | |
" <td> 20.138541</td>\n", | |
" <td> 50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>tra</th>\n", | |
" <th>-1</th>\n", | |
" <td> 1.743176</td>\n", | |
" <td> 85.61</td>\n", | |
" <td> 12.552567</td>\n", | |
" <td> 31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">two</th>\n", | |
" <th>-2</th>\n", | |
" <td> 1.801735</td>\n", | |
" <td> 15.84</td>\n", | |
" <td> 7.379950</td>\n", | |
" <td> 13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 3</th>\n", | |
" <td> 1.801735</td>\n", | |
" <td> 15.84</td>\n", | |
" <td> 7.379950</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>usion</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.099034</td>\n", | |
" <td> 43.56</td>\n", | |
" <td> 8.800000</td>\n", | |
" <td> 22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <th>...</th>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"13\" valign=\"top\">work</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">done</th>\n", | |
" <th> 1</th>\n", | |
" <td> 4.073897</td>\n", | |
" <td> 15.44</td>\n", | |
" <td> 6.529377</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 3</th>\n", | |
" <td> 4.073897</td>\n", | |
" <td> 15.44</td>\n", | |
" <td> 6.529377</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>earlier</th>\n", | |
" <th>-1</th>\n", | |
" <td> 2.350643</td>\n", | |
" <td> 17.24</td>\n", | |
" <td> 5.752108</td>\n", | |
" <td> 14</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>future</th>\n", | |
" <th>-1</th>\n", | |
" <td> 4.246222</td>\n", | |
" <td> 41.81</td>\n", | |
" <td> 9.166065</td>\n", | |
" <td> 22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>main</th>\n", | |
" <th>-4</th>\n", | |
" <td> 2.005992</td>\n", | |
" <td> 12.64</td>\n", | |
" <td> 4.955278</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">paper</th>\n", | |
" <th>-4</th>\n", | |
" <td> 6.486453</td>\n", | |
" <td> 28.20</td>\n", | |
" <td> 9.310367</td>\n", | |
" <td> 13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>-3</th>\n", | |
" <td> 6.486453</td>\n", | |
" <td> 28.20</td>\n", | |
" <td> 9.310367</td>\n", | |
" <td> 15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">present</th>\n", | |
" <th>-2</th>\n", | |
" <td> 4.418548</td>\n", | |
" <td> 11.56</td>\n", | |
" <td> 6.200000</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>-1</th>\n", | |
" <td> 4.418548</td>\n", | |
" <td> 11.56</td>\n", | |
" <td> 6.200000</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>previous</th>\n", | |
" <th>-1</th>\n", | |
" <td> 13.551795</td>\n", | |
" <td> 469.09</td>\n", | |
" <td> 29.758486</td>\n", | |
" <td> 73</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>progress</th>\n", | |
" <th> 2</th>\n", | |
" <td> 2.005992</td>\n", | |
" <td> 12.64</td>\n", | |
" <td> 4.955278</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>recent</th>\n", | |
" <th>-1</th>\n", | |
" <td> 4.246222</td>\n", | |
" <td> 60.41</td>\n", | |
" <td> 10.472387</td>\n", | |
" <td> 26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>well</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.833666</td>\n", | |
" <td> 10.61</td>\n", | |
" <td> 4.557299</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>works</th>\n", | |
" <th>previous</th>\n", | |
" <th>-1</th>\n", | |
" <td> 9.569996</td>\n", | |
" <td> 14.84</td>\n", | |
" <td> 5.452272</td>\n", | |
" <td> 13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">world</th>\n", | |
" <th>data</th>\n", | |
" <th> 1</th>\n", | |
" <td> 5.345151</td>\n", | |
" <td> 12.21</td>\n", | |
" <td> 5.194281</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>real</th>\n", | |
" <th>-1</th>\n", | |
" <td> 22.807470</td>\n", | |
" <td> 390.81</td>\n", | |
" <td> 26.468915</td>\n", | |
" <td> 66</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>virtual</th>\n", | |
" <th>-1</th>\n", | |
" <td> 3.948165</td>\n", | |
" <td> 10.61</td>\n", | |
" <td> 4.557299</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>worst</th>\n", | |
" <th>case</th>\n", | |
" <th> 1</th>\n", | |
" <td> 11.213565</td>\n", | |
" <td> 43.21</td>\n", | |
" <td> 8.873431</td>\n", | |
" <td> 22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">would</th>\n", | |
" <th>allow</th>\n", | |
" <th> 1</th>\n", | |
" <td> 6.998167</td>\n", | |
" <td> 10.76</td>\n", | |
" <td> 4.480244</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>like</th>\n", | |
" <th> 1</th>\n", | |
" <td> 14.299831</td>\n", | |
" <td> 31.81</td>\n", | |
" <td> 7.940035</td>\n", | |
" <td> 19</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">x</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">#</th>\n", | |
" <th>-1</th>\n", | |
" <td> 10.066947</td>\n", | |
" <td> 52.41</td>\n", | |
" <td> 11.939475</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 1</th>\n", | |
" <td> 10.066947</td>\n", | |
" <td> 52.41</td>\n", | |
" <td> 11.939475</td>\n", | |
" <td> 24</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"6\" valign=\"top\">years</th>\n", | |
" <th>In</th>\n", | |
" <th>-2</th>\n", | |
" <td> 8.225192</td>\n", | |
" <td> 84.60</td>\n", | |
" <td> 13.197826</td>\n", | |
" <td> 31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ago</th>\n", | |
" <th> 1</th>\n", | |
" <td> 1.962656</td>\n", | |
" <td> 10.89</td>\n", | |
" <td> 4.400000</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>last</th>\n", | |
" <th>-2</th>\n", | |
" <td> 8.009242</td>\n", | |
" <td> 73.29</td>\n", | |
" <td> 12.460958</td>\n", | |
" <td> 28</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>many</th>\n", | |
" <th>-1</th>\n", | |
" <td> 3.042404</td>\n", | |
" <td> 20.04</td>\n", | |
" <td> 6.076606</td>\n", | |
" <td> 15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>past</th>\n", | |
" <th>-2</th>\n", | |
" <td> 8.441141</td>\n", | |
" <td> 107.49</td>\n", | |
" <td> 14.467738</td>\n", | |
" <td> 35</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>recent</th>\n", | |
" <th>-1</th>\n", | |
" <td> 13.623929</td>\n", | |
" <td> 380.25</td>\n", | |
" <td> 26.000000</td>\n", | |
" <td> 65</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>\ufffd\ufffd</th>\n", | |
" <th>\ufffd</th>\n", | |
" <th>-1</th>\n", | |
" <td> 6.606903</td>\n", | |
" <td> 22.24</td>\n", | |
" <td> 7.115930</td>\n", | |
" <td> 16</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>\ufffd</th>\n", | |
" <th>\ufffd\ufffd</th>\n", | |
" <th> 1</th>\n", | |
" <td> 6.724206</td>\n", | |
" <td> 22.24</td>\n", | |
" <td> 7.115930</td>\n", | |
" <td> 16</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5580 rows \u00d7 4 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 45, | |
"text": [ | |
" strength spread peak p\n", | |
"base word collocate distance \n", | |
"# # -2 37.990803 506.56 87.706888 107\n", | |
" 2 37.990803 506.56 87.706888 107\n", | |
" The -3 2.035968 13.16 7.427671 10\n", | |
" -2 2.035968 13.16 7.427671 8\n", | |
" We -5 1.333268 20.04 7.076606 8\n", | |
" -4 1.333268 20.04 7.076606 14\n", | |
" c 1 2.153085 114.20 14.686440 36\n", | |
" ciency 1 2.738668 178.60 18.364131 45\n", | |
" cient 1 8.828738 2043.44 60.604425 151\n", | |
" ciently 1 2.153085 128.60 15.340194 38\n", | |
" coe -1 1.216151 43.04 8.960488 22\n", | |
" cult 1 1.860293 103.45 13.671037 34\n", | |
" di -1 20.716086 9893.61 135.166628 334\n", | |
" e -1 24.053912 11877.84 150.385504 368\n", | |
" ect 1 2.035968 115.36 14.540577 36\n", | |
" ective 1 3.148577 270.21 22.138066 55\n", | |
" ectively 1 1.157593 47.61 9.200000 23\n", | |
" ects 1 1.977410 109.21 14.150359 35\n", | |
" er 1 1.860293 96.85 13.341240 33\n", | |
" erent 1 12.459357 3617.24 81.743495 202\n", | |
" erential 1 1.450384 59.96 10.543384 26\n", | |
" ers 1 1.450384 59.96 10.543384 26\n", | |
" n -1 2.504435 24.44 9.543683 16\n", | |
" 1 2.504435 24.44 9.543683 12\n", | |
" o -1 4.026952 384.56 26.810201 66\n", | |
" su -1 2.855785 223.16 20.138541 50\n", | |
" tra -1 1.743176 85.61 12.552567 31\n", | |
" two -2 1.801735 15.84 7.379950 13\n", | |
" 3 1.801735 15.84 7.379950 8\n", | |
" usion 1 1.099034 43.56 8.800000 22\n", | |
"... ... ... ... ...\n", | |
"work done 1 4.073897 15.44 6.529377 8\n", | |
" 3 4.073897 15.44 6.529377 11\n", | |
" earlier -1 2.350643 17.24 5.752108 14\n", | |
" future -1 4.246222 41.81 9.166065 22\n", | |
" main -4 2.005992 12.64 4.955278 12\n", | |
" paper -4 6.486453 28.20 9.310367 13\n", | |
" -3 6.486453 28.20 9.310367 15\n", | |
" present -2 4.418548 11.56 6.200000 7\n", | |
" -1 4.418548 11.56 6.200000 11\n", | |
" previous -1 13.551795 469.09 29.758486 73\n", | |
" progress 2 2.005992 12.64 4.955278 12\n", | |
" recent -1 4.246222 60.41 10.472387 26\n", | |
" well 1 1.833666 10.61 4.557299 11\n", | |
"works previous -1 9.569996 14.84 5.452272 13\n", | |
"world data 1 5.345151 12.21 5.194281 12\n", | |
" real -1 22.807470 390.81 26.468915 66\n", | |
" virtual -1 3.948165 10.61 4.557299 11\n", | |
"worst case 1 11.213565 43.21 8.873431 22\n", | |
"would allow 1 6.998167 10.76 4.480244 11\n", | |
" like 1 14.299831 31.81 7.940035 19\n", | |
"x # -1 10.066947 52.41 11.939475 12\n", | |
" 1 10.066947 52.41 11.939475 24\n", | |
"years In -2 8.225192 84.60 13.197826 31\n", | |
" ago 1 1.962656 10.89 4.400000 11\n", | |
" last -2 8.009242 73.29 12.460958 28\n", | |
" many -1 3.042404 20.04 6.076606 15\n", | |
" past -2 8.441141 107.49 14.467738 35\n", | |
" recent -1 13.623929 380.25 26.000000 65\n", | |
"\ufffd\ufffd \ufffd -1 6.606903 22.24 7.115930 16\n", | |
"\ufffd \ufffd\ufffd 1 6.724206 22.24 7.115930 16\n", | |
"\n", | |
"[5580 rows x 4 columns]" | |
] | |
} | |
], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### \u4f9d\u64da strength \u6392\u5e8f collocation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"# Ten strongest collocations. DataFrame.sort(columns=...) was removed in pandas 0.20;\n", | |
"# sort_values(by=...) is its replacement and needs pandas >= 0.17.\n", | |
"collocations_df.sort_values(by='strength', ascending=False).head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>strength</th>\n", | |
" <th>spread</th>\n", | |
" <th>peak</th>\n", | |
" <th>p</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>base word</th>\n", | |
" <th>collocate</th>\n", | |
" <th>distance</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>This</th>\n", | |
" <th>paper</th>\n", | |
" <th> 1</th>\n", | |
" <td> 48.541987</td>\n", | |
" <td> 342586.69</td>\n", | |
" <td> 782.409055</td>\n", | |
" <td> 1953</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>also</th>\n", | |
" <th>We</th>\n", | |
" <th>-1</th>\n", | |
" <td> 43.152065</td>\n", | |
" <td> 13829.36</td>\n", | |
" <td> 159.398299</td>\n", | |
" <td> 394</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>In</th>\n", | |
" <th>paper</th>\n", | |
" <th> 2</th>\n", | |
" <td> 39.973864</td>\n", | |
" <td> 288595.41</td>\n", | |
" <td> 720.910769</td>\n", | |
" <td> 1795</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">#</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">#</th>\n", | |
" <th>-2</th>\n", | |
" <td> 37.990803</td>\n", | |
" <td> 506.56</td>\n", | |
" <td> 87.706888</td>\n", | |
" <td> 107</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> 2</th>\n", | |
" <td> 37.990803</td>\n", | |
" <td> 506.56</td>\n", | |
" <td> 87.706888</td>\n", | |
" <td> 107</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>present</th>\n", | |
" <th>We</th>\n", | |
" <th>-1</th>\n", | |
" <td> 37.248306</td>\n", | |
" <td> 43512.69</td>\n", | |
" <td> 284.696956</td>\n", | |
" <td> 700</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>order</th>\n", | |
" <th>In</th>\n", | |
" <th>-1</th>\n", | |
" <td> 36.934238</td>\n", | |
" <td> 3010.44</td>\n", | |
" <td> 73.267477</td>\n", | |
" <td> 183</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>show</th>\n", | |
" <th>We</th>\n", | |
" <th>-1</th>\n", | |
" <td> 36.319055</td>\n", | |
" <td> 13484.29</td>\n", | |
" <td> 167.221876</td>\n", | |
" <td> 388</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>paper</th>\n", | |
" <th>This</th>\n", | |
" <th>-1</th>\n", | |
" <td> 35.956122</td>\n", | |
" <td> 342586.69</td>\n", | |
" <td> 782.409055</td>\n", | |
" <td> 1953</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>The</th>\n", | |
" <th>paper</th>\n", | |
" <th> 1</th>\n", | |
" <td> 35.098383</td>\n", | |
" <td> 7977.44</td>\n", | |
" <td> 138.716516</td>\n", | |
" <td> 295</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 46, | |
"text": [ | |
" strength spread peak p\n", | |
"base word collocate distance \n", | |
"This paper 1 48.541987 342586.69 782.409055 1953\n", | |
"also We -1 43.152065 13829.36 159.398299 394\n", | |
"In paper 2 39.973864 288595.41 720.910769 1795\n", | |
"# # -2 37.990803 506.56 87.706888 107\n", | |
" 2 37.990803 506.56 87.706888 107\n", | |
"present We -1 37.248306 43512.69 284.696956 700\n", | |
"order In -1 36.934238 3010.44 73.267477 183\n", | |
"show We -1 36.319055 13484.29 167.221876 388\n", | |
"paper This -1 35.956122 342586.69 782.409055 1953\n", | |
"The paper 1 35.098383 7977.44 138.716516 295" | |
] | |
} | |
], | |
"prompt_number": 46 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### collocation Dataframe \u7684\u5404\u6b04\u4f4d\u7d71\u8a08\u6578\u64da" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"collocations_df.describe()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>strength</th>\n", | |
" <th>spread</th>\n", | |
" <th>peak</th>\n", | |
" <th>p</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td> 5580.000000</td>\n", | |
" <td> 5580.000000</td>\n", | |
" <td> 5580.000000</td>\n", | |
" <td> 5580.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td> 7.710192</td>\n", | |
" <td> 386.050444</td>\n", | |
" <td> 11.896760</td>\n", | |
" <td> 24.913620</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td> 5.061042</td>\n", | |
" <td> 8563.116375</td>\n", | |
" <td> 24.451848</td>\n", | |
" <td> 60.060065</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td> 1.011534</td>\n", | |
" <td> 10.010000</td>\n", | |
" <td> 4.400000</td>\n", | |
" <td> 5.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td> 3.875809</td>\n", | |
" <td> 14.090000</td>\n", | |
" <td> 5.810243</td>\n", | |
" <td> 12.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td> 6.722563</td>\n", | |
" <td> 23.040000</td>\n", | |
" <td> 7.570744</td>\n", | |
" <td> 15.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td> 10.322662</td>\n", | |
" <td> 51.840000</td>\n", | |
" <td> 11.200000</td>\n", | |
" <td> 23.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td> 48.541987</td>\n", | |
" <td> 342586.690000</td>\n", | |
" <td> 782.409055</td>\n", | |
" <td> 1953.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 47, | |
"text": [ | |
" strength spread peak p\n", | |
"count 5580.000000 5580.000000 5580.000000 5580.000000\n", | |
"mean 7.710192 386.050444 11.896760 24.913620\n", | |
"std 5.061042 8563.116375 24.451848 60.060065\n", | |
"min 1.011534 10.010000 4.400000 5.000000\n", | |
"25% 3.875809 14.090000 5.810243 12.000000\n", | |
"50% 6.722563 23.040000 7.570744 15.000000\n", | |
"75% 10.322662 51.840000 11.200000 23.000000\n", | |
"max 48.541987 342586.690000 782.409055 1953.000000" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### role \u7684\u76f8\u95dc collocations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"collocations_df.loc['role']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>strength</th>\n", | |
" <th>spread</th>\n", | |
" <th>peak</th>\n", | |
" <th>p</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>collocate</th>\n", | |
" <th>distance</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>The</th>\n", | |
" <th>-1</th>\n", | |
" <td> 5.058224</td>\n", | |
" <td> 19.49</td>\n", | |
" <td> 6.314748</td>\n", | |
" <td> 15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>central</th>\n", | |
" <th>-1</th>\n", | |
" <td> 3.306837</td>\n", | |
" <td> 10.61</td>\n", | |
" <td> 4.557299</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>important</th>\n", | |
" <th>-1</th>\n", | |
" <td> 11.771874</td>\n", | |
" <td> 134.96</td>\n", | |
" <td> 15.817229</td>\n", | |
" <td> 39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>play</th>\n", | |
" <th>-3</th>\n", | |
" <td> 19.945013</td>\n", | |
" <td> 221.00</td>\n", | |
" <td> 21.866069</td>\n", | |
" <td> 51</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>plays</th>\n", | |
" <th>-3</th>\n", | |
" <td> 7.393407</td>\n", | |
" <td> 34.01</td>\n", | |
" <td> 8.531809</td>\n", | |
" <td> 20</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 48, | |
"text": [ | |
" strength spread peak p\n", | |
"collocate distance \n", | |
"The -1 5.058224 19.49 6.314748 15\n", | |
"central -1 3.306837 10.61 4.557299 11\n", | |
"important -1 11.771874 134.96 15.817229 39\n", | |
"play -3 19.945013 221.00 21.866069 51\n", | |
"plays -3 7.393407 34.01 8.531809 20" | |
] | |
} | |
], | |
"prompt_number": 48 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### strength \u7684\u6578\u64da\u5206\u4f48" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"collocations_df.strength.hist(bins=100, log=True)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 49, | |
"text": [ | |
"<matplotlib.axes.AxesSubplot at 0x117f5f7d0>" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "display_data", | |
"png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAEDCAYAAADQunSaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEddJREFUeJzt3X+M5PVdx/HnsHuXwnHLsf6ggpcs6UEFQ7INDfijjWNR\ng/EHSWuCJBrujK39gyoxUawxYTb+4Y+YSCIxprGwxCjQxmqPmIo1dggJsYrepiigcLlNgCOnyR3e\nerRyHOsf35l5z+7t7c73MzP7/fV8JJub78x8Zz73ur33fL/v72e+X5AkSZIkSZIkSZIkSZIkSZIk\nSZKkCpiZwmt+D/DbwL3AHHBsCu8hSSrYZcAXih6EJGlnjwCngBc23X8n8DLwCvDA0P0/BXwF+Piu\njE6SNJaPAh9iY5GfAV4FFoA9wApw06b1vrwbg5MkbW12xOc9S1bMh91GVuRXe8tPAHcB30m2Bf8+\n4Gtjj1CSlGzUIr+V64DXhpZfB24Hnun9SJIKNk6RXx/nja+99tr1kydPjvMSktREx4FDoz75sjHe\n6A3g4NDyQbKt+ZGcPHmS9fV1f9bXefDBBwsfQ1l+zMIszGL7H+ADeQr1OEX+eeAGsl79XuBu4OgY\nr9dYq6urRQ+hNMwimEUwi3SjFvnHgeeAG8n68EeAd4H7gKeBF4EngZfyvHmn06Hb7eZZRZIaqdvt\n0ul0cq/XmvxQRrbe2/VovG63S7vdLnoYpWAWwSyCWYRWqwU5ardFXpIqJG+RH6cnrwmxZRXMIphF\nMIt0hRZ5e/KSNBp78pLUALZrJEkDFvkSsGUVzCKYRTCLdPbkJakC7MlLUgPYk5ckDVjkS8CWVTCL\nYBbBLNJZ5CWpxmYKfO9O/8bCwkJxoyiBpv/9h5lFMItgFtnezPLyMs888wzA0qjreeBVkirEA68V\nZL8xmEUwi2AW6SzyklRjtmskqUJs10iSBjytQQmYQTCLYBbBLNJPazA7+aGMLmXAktRE7XabdrvN\n0tLIsycBe/KSVCn25CVJAxb5ErDfGMwimEUwi3QWeUmqMXvyklQheXvynqBMkirAE5RVWLfbpd1u\nFz2MUjCLYBbBLIKzayRJA27JS1KFuCUvSRqwyJeAc4CDWQSzCGaRziIvSTVmT16SKsSevCRpwCJf\nAvYbg1kEswhmkc6LhkhSBaReNMSevCRViD15SdKARb4EbFkFswhmEcwinUVekmqslj35ubl51tbO\n9Jb2AOcB2L//as6ePT2V95Sk3dCYnvzc3DytVotWq8Xc3PyGx7ICv977OT+4HYV/+/UlqS4qW+SH\nC/na2tqgYPc+5XKuf+aSz9uNDwP7jcEsglkEs0g3W/QAJuNdsoLdN7kuVHwYwNpakd0tScqvUj35\njb12iMLe4uIiv9Vje8g+EC5ef3gs272Pc/slFSlvT750Rf7YsWOcOHFisHzHHXdw1VVXAf2/3FbF\ne9QiP2rx55LrWOQlFanyRf6DH/wwb7yxn5mZq3nnnX8BTvOtb/3v8Gq9PydZ5PPvCUxypo7Xrwxm\nEcwimEWo/OyaCxfWOXfuDzh79kvMzHysV+D7M2WK1u/9O1NHUjUUeuC10+nQbrcr/wk97sHZqv/9\nJ8ksglkEs8j2ZlJmGZWuXXPo0K0cP/454Fb27TvCuXPLTK7dMtn1++PfeKxgY3/fL2BJmqTKt2uq\nL1o6O83B73MOcDCLYBbBLNJZ5JPNjvjlq1n79ZIKY7tm4utv91rTmZ0jqTls15Ta8OycNbfwJU2d\nRb4wW0/HbDp7r8Esglmks8hLUo1Z5FUqzocOZhHMIp1FXpJqzCKvUrH3GswimEU6i7wk1ZhFXqVi\n7zWYRTCLdBb5UvBbsZKmwyJfCs6Z77P3GswimEU6i3zpzG64KHmrtdetfEnJpnU++buAnwDmgM8D\nX53S+9TQVhclb86FxO29BrMIZpFuWlvyXwY+BXwauH
tK79FAs27hS8olT5F/BDgFvLDp/juBl4FX\ngAc2PfZbwMPJo9Mmw+eqP08d+/j2XoNZBLNIl6fIP0pW0IfNkBXxO4GbgXuAm8h6DL8HfAVYGX+Y\nkqQUeYr8s8DmTcbbgFeBVbJNyyfI+vH3AXcAPwP80tij1MiqflFxe6/BLIJZpBv3wOt1wGtDy68D\ntwOfAf5ozNdWgnEvKi6pXsYt8hdf2imHw4cPs7CwAMCBAwdYXFwcerTLhQtvbli+WBdob3p88/Ju\nr3+p5093/Yt7lhuX+4/3t4jKuty/ryzjKXJ5ZWWF+++/vzTjKXL5oYceYnFxsTTj2c3lbrfL8vIy\nwKBe5pF3U28BeAq4pbf8fUCH6NV/FniPrB+/kwZe/m966/ezzC4NdvH9VdHtdt017zGLYBZhty//\n9zxwA1nx30s2XfLomK+pBvM/cjCLYBbp8hT5x4HngBvJ+vBHyOb03Qc8DbwIPAm8NOoLdjqdDbvp\nShXz5yXVU7fbpdPp5F6vyKpgu2YXxmK7prrMIphF2O12jSSpxCzyKhW31oJZBLNIV2iRtycvSaNJ\n7ckXXuT9hNYwP/SDWQSzyPZmKlfkJUnTZZFXqbhnF8wimEU6i7wk1VjhPXl7bdNUvUsJ+vsQzCKY\nRfqB12ld/m8kKQNWHs2+lKBUJ+12m3a7zdLSUq71bNeoVOy9BrMIZpHOIi9JNWaRV6nYew1mEcwi\nnQdeJakCPAtlic78WJWxVO0MlZI8C6UkaYhFXqVi+y6YRTCLdBb5xpot/RejJI2v8C9D9Sf4a7fF\nF6XK9MUofxeCWQSzyPZmUvZoPPA68fWrOZb+v8Xc3Dxra2cA2L//as6ePY2k8vDAq8aSFfh1YH1Q\n7HeTvddgFsEs0lnkNbK5uXn7+FLFFNqTV7XEVv70+vj2XoNZBLNI55a8GJ5pk7KOW/VSeVnkRcy0\nyfMN2Fhnkr17e6/BLIJZpLPIS1KNeYIylYq912AWwSw8QRlVnJtehbEM/xtlPfudnydpepwnr0pz\nzy6YRTCLdBZ5Saoxi7xKxd5rMItgFuks8pJUYxZ5bSO+8JTvi1Lp7L0Gswhmkc7TGmgbcTriTHlO\nSSxpNG7Jq1TsvQazCGaRzi9DSVIFpH4ZqvAi7ye0hvmhH8wimEW2N1O5Iq+68IyUUll54FUTMLnr\nxbpnF8wimEU6t+QlqcYs8ioVe6/BLIJZpLPIS1KNWeQ1YeMdhLX3GswimEU6D7xqwiZ3EFbS+NyS\nV6nYew1mEcwinUVekmrMIq8pyt+ft/cazCKYRTp78poi+/NS0Qo/d429tqbYeG76S23Z+/sQzCKY\nRfoJygrdkk8ZsKpq47np3bKX8mm327TbbZaWlnKtZ09epWLvNZhFMIt0FnlJqjGLvErF3mswi2AW\n6Zxdo4LMDl0cfA9wHoDLL7+St99eK2xUUt1Y5FWQ4QOxrcHtb37TA7J99qGDWaSzXaPSmpubH5p2\nuXekKZiSNrLIq7TW1s6QbeGvk7Vz1gc/2WP1Zh86mEU62zUqmcuGevWSxuWWvErmPWKLvdnsQwez\nSGeRl6Qas8hLJWUfOphFOou8JNWYRV4qKfvQwSzSWeQlqcYs8lJJ2YcOZpFuGkX+euBPgS9O4bUl\nSTlMo8ifAH5xCq8rNYp96GAW6UYt8o8Ap4AXNt1/J/Ay8ArwwATHJe0g/0XCpSYatcg/SlbQh80A\nD/fuvxm4B7hpckOTttM/i2V9z2NjHzqYRbpRi/yzwOb/SbcBrwKrZGePegK4C5gH/gRYxK17SSrU\nOCcouw54bWj5deB24DTw6VFe4PDhwywsLABw4MABFhcXhx7tcuHCmxuWL9YF2pse37y82+tf6vlN\nX3/z8ye5flyAZP/+qzl79vRgy6/fy63q8uBvWpLxFLXcv68s49nN5W63y/LyMsCgXuaR53R/C8BT\nwC295U+QtWo+2V
v+ObIi/5kRX299ff3ik1AdOnQrx49/DriVffuOcO7cMltdXOLSt0d93rTWdyxF\nj2Wr3yupLnobNCPX7nFm17wBHBxaPki2NS9pAuxDB7NIN06Rfx64gWwLfy9wN3B0AmOSJE3IqEX+\nceA54EayPvwRsukN9wFPAy8CTwIv5XnzTqfjJ7QmrD5TK50bHswi25vpdDq51yvyEjz25B3L1Ne3\nP6+62c2evKQpci83mEU6i7wk1VihRd6evKar2v15+9DBLOzJ0/Tes2OxP69msCcv1YR7ucEs0lnk\nJanG7MlLJWUfOpiFPXmK7vc6lvKPxZ686sCevFQT7uUGs0hnkZekGrPISyVlHzqYRToPvEpSBXjg\n1QOMjmWH21U78Dp8JaSmM4vggVdJ0oBFXiopt1yDWaSzyEtSjVnkpZJyUkIwi3TOrlHjzM3ND05B\nXNXTEKt5nF3jLBLHssPt/u9bNjth42tVbeaNmsvZNZKkAYu8VFK2MoNZpLPIS1KNWeSlknJueDCL\ndBZ5Saoxp1BKzG45nXJ4qmUR0yz9vxHMIn0K5ezkhzK6lAFLk/cu/SmVa2sxM21t7cyW90tFaLfb\ntNttlpaWcq1nu0YqKfvQwSzSWeQlqcYs8lJJ2YcOZpHOIi9JNWaRl0rKPnQwi3QWeUmqMYu8VFL2\noYNZpCt8nnx/7qc0XbP9U7ROYP09wPnBI/v3X83Zs6cvWmNubr431/7Sz9luncsvv5K3314bY8yq\nk263m/Rh5/nkJ76+Y6n6WLY+7/z262/1u7x5/VHOWZ+yjprF88lLkgYs8pJKz558Oou8JNWYRV5S\n6Tk5I51FXpJqzCIvqfTsyaezyEtSjVnkJZWePfl0FnlJqjGLvKTSsyefziIvSTVWaJHvdDp+QqsG\nspOXtVot5ubmd3z23Nx8ruenrj/u+0zSuGOxJ5/tzXQ6ndzreYKyia/vWKo+lpQTlI2y/k737/RY\nnufked5uKNNYqs4TlEmqHff401nkJanGLPKSSs+efDqLvCTVmEVeUunZk09nkZekGrPISyo9e/Lp\nLPKSVGMWeUmlZ08+nUVekmrMIi+p9OzJp7PIS1KNWeQllZ49+XQWeUmqMYu8pNKzJ59udgqvuQ/4\nY+D/gC7wF1N4D0nSCKaxJf9x4AvAp4CfnsLrS2oYe/LpRi3yjwCngBc23X8n8DLwCvBA777rgNd6\nty+MO0BJWllZKXoIlTVqkX+UrKAPmwEe7t1/M3APcBPwOnAw5+tL0iW99dZbRQ+hskYtws8CZzbd\ndxvwKrAKnAeeAO4CvgR8gqwvf3Qio5QkJRnnwOtwWwayLfjbgbeBXxhnUJI0bHV1teghVNY4RX7c\ny60fb7VaH9j6oQ8DcO5cf3n4wuSj3E5ZZ5LrO5Yqj6XVms76o92/02N5npPnebthvLE89thjkxxM\nlR3P8+RxivwbRO+d3u3Xc6x/aIz3liRN2AIbZ9fMkn2iLAB7gRWyA6+SpIp5HDhJ9gWn14Ajvft/\nHPgPsgOwny1maJKkMtlqbn1TbPV9g3ngq8B/An8HHChgXEU4CHwN+Hfg34Bf7t3fxDzeB3ydbG/4\nReB3evc3MYu+GeAY8FRvualZrALfIMvin3r3lTqLGbKt/gVgD81r8XwU+BAbi/zvA7/eu/0A8Lu7\nPaiCvB9Y7N2+kmyP8Caam8cVvT9ngX8EPkJzswD4VeDPiWnYTc3iBFlRH1bqLL4f+Nuh5d/o/TTJ\nAhuL/MvANb3b7+8tN9FfAz+CeVwB/DPwvTQ3i+8G/h74YWJLvqlZnAC+bdN9ubLY7W+kbjW3/rpd\nHkPZXEPWwqH35zXbPLeuFsj2cL5Oc/O4jGzP9hTRxmpqFn8I/Brw3tB9Tc1inewD73ngk737cmUx\njbNQbmfcufV1t07zMroS+EvgV4C1TY81KY/3yNpXVwFPk23FDmtKFj8J/BdZD7p9
iec0JQuAHwTe\nBL6DrA+/eat9xyx2e0t+3Ln1dXSKbJcL4LvIfsGbYg9Zgf8zsnYNNDsPgP8B/ga4lWZm8QNkZ689\nQTar72Nkvx9NzAKyAg/w38BfkZ1OJlcWu13knwduIObW343ntzkK3Nu7fS9R7OquBXyebDbJQ0P3\nNzGPbydmSFwO/CjZlmwTs/hNso2/64GfBf4B+HmamcUVwP7e7X3Aj5Edzyt9Fk2eW9//vsE7xPcN\n5sl6bqWcDjVFHyFrUayQFbRjZNNrm5jHLcC/kmXxDbJ+NDQzi2E/RGwENjGL68l+J1bIphn362UT\ns5AkSZIkSZIkSZIkSZIkSZIkSZJUd/8PiTGvi7g7o0gAAAAASUVORK5CYII=\n", | |
"text": [ | |
"<matplotlib.figure.Figure at 0x117f1eb90>" | |
] | |
} | |
], | |
"prompt_number": 49 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### \u8907\u96dc\u7be9\u9078\u8655\u7406\u793a\u7bc4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"collocations_df[ collocations_df.index.map(lambda x: x[2] == 1) & (collocations_df.spread > 800) \n", | |
" \n", | |
" ].sort(columns='strength', ascending=False)[:20]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>strength</th>\n", | |
" <th>spread</th>\n", | |
" <th>peak</th>\n", | |
" <th>p</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>base word</th>\n", | |
" <th>collocate</th>\n", | |
" <th>distance</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>This</th>\n", | |
" <th>paper</th>\n", | |
" <th>1</th>\n", | |
" <td> 48.541987</td>\n", | |
" <td> 342586.69</td>\n", | |
" <td> 782.409055</td>\n", | |
" <td> 1953</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>The</th>\n", | |
" <th>paper</th>\n", | |
" <th>1</th>\n", | |
" <td> 35.098383</td>\n", | |
" <td> 7977.44</td>\n", | |
" <td> 138.716516</td>\n", | |
" <td> 295</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>e</th>\n", | |
" <th>#</th>\n", | |
" <th>1</th>\n", | |
" <td> 34.011620</td>\n", | |
" <td> 11877.84</td>\n", | |
" <td> 150.385504</td>\n", | |
" <td> 368</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>We</th>\n", | |
" <th>present</th>\n", | |
" <th>1</th>\n", | |
" <td> 32.978409</td>\n", | |
" <td> 43512.69</td>\n", | |
" <td> 284.696956</td>\n", | |
" <td> 700</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>large</th>\n", | |
" <th>number</th>\n", | |
" <th>1</th>\n", | |
" <td> 28.867409</td>\n", | |
" <td> 1197.04</td>\n", | |
" <td> 47.998266</td>\n", | |
" <td> 117</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>case</th>\n", | |
" <th>study</th>\n", | |
" <th>1</th>\n", | |
" <td> 26.060464</td>\n", | |
" <td> 1423.96</td>\n", | |
" <td> 50.535395</td>\n", | |
" <td> 126</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>di</th>\n", | |
" <th>#</th>\n", | |
" <th>1</th>\n", | |
" <td> 25.816074</td>\n", | |
" <td> 9893.61</td>\n", | |
" <td> 135.166628</td>\n", | |
" <td> 334</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>results</th>\n", | |
" <th>show</th>\n", | |
" <th>1</th>\n", | |
" <td> 25.182837</td>\n", | |
" <td> 4401.69</td>\n", | |
" <td> 97.245233</td>\n", | |
" <td> 229</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>For</th>\n", | |
" <th>example</th>\n", | |
" <th>1</th>\n", | |
" <td> 23.615373</td>\n", | |
" <td> 880.20</td>\n", | |
" <td> 39.668165</td>\n", | |
" <td> 99</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>information</th>\n", | |
" <th>systems</th>\n", | |
" <th>1</th>\n", | |
" <td> 23.330641</td>\n", | |
" <td> 830.89</td>\n", | |
" <td> 42.925163</td>\n", | |
" <td> 100</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>wide</th>\n", | |
" <th>range</th>\n", | |
" <th>1</th>\n", | |
" <td> 22.344460</td>\n", | |
" <td> 1147.04</td>\n", | |
" <td> 45.267979</td>\n", | |
" <td> 113</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>We</th>\n", | |
" <th>show</th>\n", | |
" <th>1</th>\n", | |
" <td> 22.075193</td>\n", | |
" <td> 13484.29</td>\n", | |
" <td> 167.221876</td>\n", | |
" <td> 388</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>experimental</th>\n", | |
" <th>results</th>\n", | |
" <th>1</th>\n", | |
" <td> 22.038362</td>\n", | |
" <td> 1994.29</td>\n", | |
" <td> 60.757474</td>\n", | |
" <td> 150</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>The</th>\n", | |
" <th>results</th>\n", | |
" <th>1</th>\n", | |
" <td> 21.055918</td>\n", | |
" <td> 3710.96</td>\n", | |
" <td> 90.717649</td>\n", | |
" <td> 205</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Our</th>\n", | |
" <th>approach</th>\n", | |
" <th>1</th>\n", | |
" <td> 19.135100</td>\n", | |
" <td> 1465.65</td>\n", | |
" <td> 52.783809</td>\n", | |
" <td> 129</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>The</th>\n", | |
" <th>system</th>\n", | |
" <th>1</th>\n", | |
" <td> 18.691626</td>\n", | |
" <td> 1435.45</td>\n", | |
" <td> 64.387333</td>\n", | |
" <td> 115</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>et</th>\n", | |
" <th>al</th>\n", | |
" <th>1</th>\n", | |
" <td> 18.461401</td>\n", | |
" <td> 2851.56</td>\n", | |
" <td> 71.200000</td>\n", | |
" <td> 178</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>We</th>\n", | |
" <th>also</th>\n", | |
" <th>1</th>\n", | |
" <td> 18.019197</td>\n", | |
" <td> 13829.36</td>\n", | |
" <td> 159.398299</td>\n", | |
" <td> 394</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">The</th>\n", | |
" <th>model</th>\n", | |
" <th>1</th>\n", | |
" <td> 17.688593</td>\n", | |
" <td> 1312.49</td>\n", | |
" <td> 61.328304</td>\n", | |
" <td> 115</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>method</th>\n", | |
" <th>1</th>\n", | |
" <td> 17.402012</td>\n", | |
" <td> 1398.41</td>\n", | |
" <td> 62.095321</td>\n", | |
" <td> 107</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 50, | |
"text": [ | |
" strength spread peak p\n", | |
"base word collocate distance \n", | |
"This paper 1 48.541987 342586.69 782.409055 1953\n", | |
"The paper 1 35.098383 7977.44 138.716516 295\n", | |
"e # 1 34.011620 11877.84 150.385504 368\n", | |
"We present 1 32.978409 43512.69 284.696956 700\n", | |
"large number 1 28.867409 1197.04 47.998266 117\n", | |
"case study 1 26.060464 1423.96 50.535395 126\n", | |
"di # 1 25.816074 9893.61 135.166628 334\n", | |
"results show 1 25.182837 4401.69 97.245233 229\n", | |
"For example 1 23.615373 880.20 39.668165 99\n", | |
"information systems 1 23.330641 830.89 42.925163 100\n", | |
"wide range 1 22.344460 1147.04 45.267979 113\n", | |
"We show 1 22.075193 13484.29 167.221876 388\n", | |
"experimental results 1 22.038362 1994.29 60.757474 150\n", | |
"The results 1 21.055918 3710.96 90.717649 205\n", | |
"Our approach 1 19.135100 1465.65 52.783809 129\n", | |
"The system 1 18.691626 1435.45 64.387333 115\n", | |
"et al 1 18.461401 2851.56 71.200000 178\n", | |
"We also 1 18.019197 13829.36 159.398299 394\n", | |
"The model 1 17.688593 1312.49 61.328304 115\n", | |
" method 1 17.402012 1398.41 62.095321 107" | |
] | |
} | |
], | |
"prompt_number": 50 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!gist nlplab\\ lab04\\ 50000.ipynb --update b5c08188f53d7540c0e3" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"https://gist.github.com/b5c08188f53d7540c0e3\r\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# BONUS!!!\n", | |
"\u4f7f\u7528 `citeseerx_descriptions_sents.txt.100M.genia` \u627e\u51fa VN \u95dc\u4fc2\u7684 collocations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 21 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment