-
-
Save jgc128/1cfeba9ddbf52b83a2fa to your computer and use it in GitHub Desktop.
Non-Negative matrix factorization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:cc620a258bec1c319225fa966334f0ce1c92696660fd9982aad2ced2ed69099c" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import re\n", | |
"import urllib\n", | |
"import urllib.parse\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", | |
"from sklearn.feature_selection import VarianceThreshold\n", | |
"from sklearn.decomposition import NMF" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"/usr/local/lib/python3.4/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.\n", | |
" .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with open(\"set2.txt\") as f:\n", | |
" content = f.readlines()\n", | |
"\n", | |
"cleaned_data = [urllib.parse.unquote_plus(c[:-1]).lower() for c in content if len(c) > 1]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"regExp = re.compile(r\"[|,\\.!@#$%^&*()\\s\\\"']*\")\n", | |
"def tokenize(text):\n", | |
" # lambda doc: [tok.strip().lower() for tok in doc.split(' ') if len(tok) > 1]\n", | |
" data = regExp.split(text)\n", | |
" return [tok.lower() for tok in data if len(tok) >= 1 and not tok.isdigit()]\n", | |
"\n", | |
"vectorizer = TfidfVectorizer(min_df=2, tokenizer = tokenize)\n", | |
"tfidf = vectorizer.fit_transform(cleaned_data)\n", | |
"\n", | |
"# selector = VarianceThreshold(0.05)\n", | |
"# tfidf = selector.fit_transform(tfidf)\n", | |
"\n", | |
"tfidf" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"<80830x20076 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 610274 stored elements in Compressed Sparse Row format>" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"nmf = NMF(n_components=10, random_state=1)\n", | |
"nmf.fit(tfidf)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"NMF(beta=1, eta=0.1, init=None, max_iter=200, n_components=10,\n", | |
" nls_max_iter=2000, random_state=1, sparseness=None, tol=0.0001)" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print(nmf.components_.shape)\n", | |
"print(nmf.reconstruction_err_)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"(10, 20076)\n", | |
"225.93780911404065\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"features = vectorizer.get_feature_names()\n", | |
"for idx, comp in enumerate(nmf.components_):\n", | |
" print(\"Component #%d:\" % idx)\n", | |
" topFeaturesIndex = comp.argsort()[:-20:-1] # get top 20 [ <first element to include> : <first element to exclude> : <step>]\n", | |
" res = [str(comp[i]) + ' ' + features[i] for i in topFeaturesIndex if comp[i] > 0.1]\n", | |
" print(' | '.join(res))\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Component #0:\n", | |
"11.9987931069 null | 0.384649655658 null-- | 0.204426869517 null\u0000 | 0.155941480833 all | 0.118161981255 where | 0.112360055086 foo | 0.106263597417 union | 0.104343020231 select\n", | |
"Component #1:\n", | |
"4.76131956566 all | 4.1385210344 union | 3.70219844906 select | 0.883003525244 where | 0.59188598274 + | 0.502541585596 null-- | 0.272183841721 like | 0.263243483255 dual | 0.239928373931 = | 0.205954630608 mode | 0.205898410438 boolean | 0.196552827752 null\u0000 | 0.191402967566 in | 0.128306990339 version | 0.110090846142 from\n", | |
"Component #2:\n", | |
"6.43775347876 chr | 0.507381656369 receive_message | 0.441648936864 dual | 0.319968876584 when | 0.319606776309 case | 0.319583449136 then | 0.319551272121 end | 0.304387782167 else | 0.279975851096 from | 0.166059852914 ::text | 0.165125096967 numeric | 0.163958618536 get_host_address | 0.162803266443 dbms_pipe | 0.161537568972 or | 0.161415318901 sn | 0.161415318901 drithsx | 0.143510459911 xmltype | 0.119819528357 select | 0.10604982085 upper\n", | |
"Component #3:\n", | |
"3.37742152694 version | 3.28871693393 user | 3.02336357317 0x3a | 3.02232290032 database | 2.71776530811 concat_ws | 1.29754275301 union | 1.03344553977 -1 | 1.00120939953 select | 0.819801199034 -- | 0.621463597518 version_compile_os | 0.433257528132 concat | 0.334425936133 / | 0.254625642389 1=0 | 0.246290655443 char | 0.217425352934 password | 0.205060543245 0x3a3a | 0.148422535953 mysql | 0.147643944763 5-- | 0.147465531691 1=2\n", | |
"Component #4:\n", | |
"5.08135886392 and | 2.08203776196 = | 1.81304968596 like | 0.879870037792 foo | 0.430278574756 sleep | 0.310148150537 or | 0.266591033301 1=0 | 0.244518048707 pg_sleep | 0.240318175172 randomblob | 0.240318175172 abcdefg | 0.218424647454 upper | 0.217807024724 hex | 0.209605525197 generate_series | 0.204745548669 not | 0.180405722659 md5 | 0.173808582048 -- | 0.155211124295 all_users | 0.13097276018 count | 0.130119092636 null\u0000\n", | |
"Component #5:\n", | |
"3.58957048678 ; | 2.280246459 if | 2.20277556194 drop | 2.20220678044 function | 1.1330452364 sleep | 1.11962487375 foo | 0.753042105163 else | 0.751631652739 select | 0.580373800559 where | 0.482699633016 begin | 0.438444890413 -- | 0.430659489951 + | 0.429750535055 ;-- | 0.284700299065 rlike | 0.28443049589 0x28 | 0.281735835636 end;-- | 0.269170268038 dbms_lock | 0.266532740363 user_lock | 0.253969040389 end--\n", | |
"Component #6:\n", | |
"4.62714311684 as | 2.09281618106 rdb | 1.74303964771 sysusers | 1.2875093143 domain | 1.28576280354 count | 1.23811055811 t2 | 1.23811055811 t1 | 1.17148947089 t3 | 1.12044316248 sysibm | 1.12044316248 systables | 0.930295926397 all_users | 0.835710649477 from | 0.636937882758 fields | 0.636937882758 types | 0.636534203625 collations | 0.429031255562 domains | 0.39619622731 t4 | 0.35437438129 select | 0.33828700673 functions\n", | |
"Component #7:\n", | |
"2.93077372045 from | 2.01585263674 information_schema | 1.80465212726 limit | 1.77086668858 -- | 1.71650144854 0x71 | 1.44108638828 table_name | 1.23921631717 select | 1.17755832446 tables | 1.15225516905 concat | 1.03413747544 password | 0.985288007282 where | 0.882171553235 aes_encrypt | 0.882171553235 aes_decrypt | 0.879058732462 column_name | 0.8302361587 union | 0.786730342951 columns | 0.73475385426 0x7873716c696e6a656e64 | 0.734424467862 0x7873716c696e6a626567696e | 0.723868780223 -1\n", | |
"Component #8:\n", | |
"5.91167104508 +char | 2.25810728987 char | 0.644321593841 + | 0.465786395891 when | 0.465785714031 then | 0.465780299644 case | 0.465742543591 end | 0.446844874032 else | 0.347765112391 int | 0.31360923732 in | 0.152341855267 or | 0.148774278874 select | 0.106072759302 where | 0.104563261427 concat\n", | |
"Component #9:\n", | |
"2.69395065088 by | 2.5296447614 when | 2.5296126982 then | 2.52941897697 end | 2.52917991637 case | 2.43626068291 else | 2.16582649507 x | 1.7307241326 order | 1.69558549259 concat | 1.40993643516 select | 1.36423954591 group | 1.33299655951 rand | 1.32629700591 floor | 1.15467345655 a | 1.14912229761 or | 0.83938795519 from | 0.79558660506 1/ | 0.744853084004 count | 0.675190349298 character_sets\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment