Created
October 22, 2014 15:36
-
-
Save jgc128/f99587b392152b989f9a to your computer and use it in GitHub Desktop.
correlation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:e7dfda19c647e9d3dc4e94180c19589cc96dada50b5952a9b5d6c318c26c65e0" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import re\n", | |
"import urllib\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", | |
"from sklearn.feature_selection import VarianceThreshold\n", | |
"from sklearn.cluster import KMeans, MiniBatchKMeans\n", | |
"from sklearn.decomposition import TruncatedSVD\n", | |
"from sklearn.preprocessing import Normalizer\n", | |
"from sklearn.pipeline import make_pipeline\n", | |
"from sklearn import svm\n", | |
"from sklearn import tree\n", | |
"from sklearn.externals.six import StringIO" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"/usr/local/lib/python3.4/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.\n", | |
" .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with open(\"set2.txt\") as f:\n", | |
" content = f.readlines()\n", | |
"\n", | |
"cleaned_data = [urllib.parse.unquote_plus(c[:-1]).lower() for c in content if len(c) > 1]\n", | |
"data = pd.Series(cleaned_data)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"factor_select = data.map(lambda x: x.find(\"select \") != -1)\n", | |
"factor_union = data.map(lambda x: x.find(\"union \") != -1)\n", | |
"factor_groupby = data.map(lambda x: x.find(\"group by\") != -1)\n", | |
"factor_where = data.map(lambda x: x.find(\"where\") != -1)\n", | |
"factor_comment = data.map(lambda x: x.find(\"/*\") != -1 or x.find(\"--\") != -1)\n", | |
"factor_insert = data.map(lambda x: x.find(\"insert\") != -1)\n", | |
"factor_brackets = data.map(lambda x: float(x.count(\"(\") + x.count(\")\")) / len(x) >= 0.04)\n", | |
"factor_sqlwords = data.map(lambda x: float(x.count(\"select\") + x.count(\"union\") + x.count(\"from\") + x.count(\"where\") + x.count(\"or\") + x.count(\"=\") + x.count(\"and\")) / len(x) >= 0.04)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 40 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"factors = pd.DataFrame({\n", | |
" 'select': factor_select,\n", | |
" 'union': factor_union,\n", | |
" 'groupby': factor_groupby,\n", | |
" 'where': factor_where,\n", | |
" 'brackets': factor_brackets,\n", | |
" 'sqlwords': factor_sqlwords,\n", | |
" 'comment': factor_comment,\n", | |
" 'insert': factor_insert\n", | |
"})" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 78 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"factors.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>brackets</th>\n", | |
" <th>comment</th>\n", | |
" <th>groupby</th>\n", | |
" <th>insert</th>\n", | |
" <th>select</th>\n", | |
" <th>sqlwords</th>\n", | |
" <th>union</th>\n", | |
" <th>where</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> True</td>\n", | |
" <td> True</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" <td> False</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 95, | |
"text": [ | |
" brackets comment groupby insert select sqlwords union where\n", | |
"0 True True False False True False True False\n", | |
"1 True True False False True True False False\n", | |
"2 True True False False True True False False\n", | |
"3 True True False False False False False False\n", | |
"4 True True False False False False False False" | |
] | |
} | |
], | |
"prompt_number": 95 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for fc in factors.columns: \n", | |
" print(fc, data[~factors[fc]].size)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"brackets 51569\n", | |
"comment 41390\n", | |
"groupby 79548\n", | |
"insert 80824\n", | |
"select 9249\n", | |
"sqlwords 54628\n", | |
"union 23304\n", | |
"where 62037\n" | |
] | |
} | |
], | |
"prompt_number": 87 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"factors.corr()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>brackets</th>\n", | |
" <th>comment</th>\n", | |
" <th>groupby</th>\n", | |
" <th>insert</th>\n", | |
" <th>select</th>\n", | |
" <th>sqlwords</th>\n", | |
" <th>union</th>\n", | |
" <th>where</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>brackets</th>\n", | |
" <td> 1.000000</td>\n", | |
" <td>-0.004200</td>\n", | |
" <td> 0.161319</td>\n", | |
" <td> 0.002474</td>\n", | |
" <td>-0.295321</td>\n", | |
" <td> 0.305836</td>\n", | |
" <td>-0.572733</td>\n", | |
" <td>-0.179849</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>comment</th>\n", | |
" <td>-0.004200</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td>-0.069641</td>\n", | |
" <td> 0.005954</td>\n", | |
" <td> 0.080779</td>\n", | |
" <td>-0.029609</td>\n", | |
" <td> 0.135064</td>\n", | |
" <td> 0.000245</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>groupby</th>\n", | |
" <td> 0.161319</td>\n", | |
" <td>-0.069641</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td>-0.001094</td>\n", | |
" <td> 0.033812</td>\n", | |
" <td> 0.144587</td>\n", | |
" <td>-0.084468</td>\n", | |
" <td>-0.019238</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>insert</th>\n", | |
" <td> 0.002474</td>\n", | |
" <td> 0.005954</td>\n", | |
" <td>-0.001094</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td> 0.003097</td>\n", | |
" <td>-0.005967</td>\n", | |
" <td>-0.004027</td>\n", | |
" <td> 0.005456</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>select</th>\n", | |
" <td>-0.295321</td>\n", | |
" <td> 0.080779</td>\n", | |
" <td> 0.033812</td>\n", | |
" <td> 0.003097</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td>-0.364833</td>\n", | |
" <td> 0.558327</td>\n", | |
" <td> 0.079711</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>sqlwords</th>\n", | |
" <td> 0.305836</td>\n", | |
" <td>-0.029609</td>\n", | |
" <td> 0.144587</td>\n", | |
" <td>-0.005967</td>\n", | |
" <td>-0.364833</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td>-0.410019</td>\n", | |
" <td> 0.091794</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>union</th>\n", | |
" <td>-0.572733</td>\n", | |
" <td> 0.135064</td>\n", | |
" <td>-0.084468</td>\n", | |
" <td>-0.004027</td>\n", | |
" <td> 0.558327</td>\n", | |
" <td>-0.410019</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td> 0.003180</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>where</th>\n", | |
" <td>-0.179849</td>\n", | |
" <td> 0.000245</td>\n", | |
" <td>-0.019238</td>\n", | |
" <td> 0.005456</td>\n", | |
" <td> 0.079711</td>\n", | |
" <td> 0.091794</td>\n", | |
" <td> 0.003180</td>\n", | |
" <td> 1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 86, | |
"text": [ | |
" brackets comment groupby insert select sqlwords \\\n", | |
"brackets 1.000000 -0.004200 0.161319 0.002474 -0.295321 0.305836 \n", | |
"comment -0.004200 1.000000 -0.069641 0.005954 0.080779 -0.029609 \n", | |
"groupby 0.161319 -0.069641 1.000000 -0.001094 0.033812 0.144587 \n", | |
"insert 0.002474 0.005954 -0.001094 1.000000 0.003097 -0.005967 \n", | |
"select -0.295321 0.080779 0.033812 0.003097 1.000000 -0.364833 \n", | |
"sqlwords 0.305836 -0.029609 0.144587 -0.005967 -0.364833 1.000000 \n", | |
"union -0.572733 0.135064 -0.084468 -0.004027 0.558327 -0.410019 \n", | |
"where -0.179849 0.000245 -0.019238 0.005456 0.079711 0.091794 \n", | |
"\n", | |
" union where \n", | |
"brackets -0.572733 -0.179849 \n", | |
"comment 0.135064 0.000245 \n", | |
"groupby -0.084468 -0.019238 \n", | |
"insert -0.004027 0.005456 \n", | |
"select 0.558327 0.079711 \n", | |
"sqlwords -0.410019 0.091794 \n", | |
"union 1.000000 0.003180 \n", | |
"where 0.003180 1.000000 " | |
] | |
} | |
], | |
"prompt_number": 86 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data[(factor_select | factor_sqlwords)].size" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 97, | |
"text": [ | |
"78973" | |
] | |
} | |
], | |
"prompt_number": 97 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"selData = factor_select | factor_sqlwords" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 92 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"factors.corrwith(selData)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 93, | |
"text": [ | |
"brackets -0.112160\n", | |
"comment -0.016338\n", | |
"groupby 0.000299\n", | |
"insert 0.001321\n", | |
"select 0.426597\n", | |
"sqlwords 0.106200\n", | |
"union 0.235275\n", | |
"where 0.068761\n", | |
"dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 93 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data[(factor_select | factor_sqlwords | factor_brackets)].size" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 98, | |
"text": [ | |
"80298" | |
] | |
} | |
], | |
"prompt_number": 98 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment