Skip to content

Instantly share code, notes, and snippets.

@jgc128
Last active August 29, 2015 14:08
Show Gist options
  • Save jgc128/1cfeba9ddbf52b83a2fa to your computer and use it in GitHub Desktop.
Save jgc128/1cfeba9ddbf52b83a2fa to your computer and use it in GitHub Desktop.
Non-Negative matrix factorization
{
"metadata": {
"name": "",
"signature": "sha256:cc620a258bec1c319225fa966334f0ce1c92696660fd9982aad2ced2ed69099c"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import re\n",
"import urllib\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from sklearn.decomposition import NMF"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/local/lib/python3.4/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.\n",
" .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with open(\"set2.txt\") as f:\n",
" content = f.readlines()\n",
"\n",
"cleaned_data = [urllib.parse.unquote_plus(c[:-1]).lower() for c in content if len(c) > 1]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"regExp = re.compile(r\"[|,\\.!@#$%^&*()\\s\\\"']*\")\n",
"def tokenize(text):\n",
" # lambda doc: [tok.strip().lower() for tok in doc.split(' ') if len(tok) > 1]\n",
" data = regExp.split(text)\n",
" return [tok.lower() for tok in data if len(tok) >= 1 and not tok.isdigit()]\n",
"\n",
"vectorizer = TfidfVectorizer(min_df=2, tokenizer = tokenize)\n",
"tfidf = vectorizer.fit_transform(cleaned_data)\n",
"\n",
"# selector = VarianceThreshold(0.05)\n",
"# tfidf = selector.fit_transform(tfidf)\n",
"\n",
"tfidf"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"<80830x20076 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 610274 stored elements in Compressed Sparse Row format>"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nmf = NMF(n_components=10, random_state=1)\n",
"nmf.fit(tfidf)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"NMF(beta=1, eta=0.1, init=None, max_iter=200, n_components=10,\n",
" nls_max_iter=2000, random_state=1, sparseness=None, tol=0.0001)"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(nmf.components_.shape)\n",
"print(nmf.reconstruction_err_)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(10, 20076)\n",
"225.93780911404065\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"features = vectorizer.get_feature_names()\n",
"for idx, comp in enumerate(nmf.components_):\n",
" print(\"Component #%d:\" % idx)\n",
" topFeaturesIndex = comp.argsort()[:-20:-1] # get top 20 [ <first element to include> : <first element to exclude> : <step>]\n",
" res = [str(comp[i]) + ' ' + features[i] for i in topFeaturesIndex if comp[i] > 0.1]\n",
" print(' | '.join(res))\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Component #0:\n",
"11.9987931069 null | 0.384649655658 null-- | 0.204426869517 null\u0000 | 0.155941480833 all | 0.118161981255 where | 0.112360055086 foo | 0.106263597417 union | 0.104343020231 select\n",
"Component #1:\n",
"4.76131956566 all | 4.1385210344 union | 3.70219844906 select | 0.883003525244 where | 0.59188598274 + | 0.502541585596 null-- | 0.272183841721 like | 0.263243483255 dual | 0.239928373931 = | 0.205954630608 mode | 0.205898410438 boolean | 0.196552827752 null\u0000 | 0.191402967566 in | 0.128306990339 version | 0.110090846142 from\n",
"Component #2:\n",
"6.43775347876 chr | 0.507381656369 receive_message | 0.441648936864 dual | 0.319968876584 when | 0.319606776309 case | 0.319583449136 then | 0.319551272121 end | 0.304387782167 else | 0.279975851096 from | 0.166059852914 ::text | 0.165125096967 numeric | 0.163958618536 get_host_address | 0.162803266443 dbms_pipe | 0.161537568972 or | 0.161415318901 sn | 0.161415318901 drithsx | 0.143510459911 xmltype | 0.119819528357 select | 0.10604982085 upper\n",
"Component #3:\n",
"3.37742152694 version | 3.28871693393 user | 3.02336357317 0x3a | 3.02232290032 database | 2.71776530811 concat_ws | 1.29754275301 union | 1.03344553977 -1 | 1.00120939953 select | 0.819801199034 -- | 0.621463597518 version_compile_os | 0.433257528132 concat | 0.334425936133 / | 0.254625642389 1=0 | 0.246290655443 char | 0.217425352934 password | 0.205060543245 0x3a3a | 0.148422535953 mysql | 0.147643944763 5-- | 0.147465531691 1=2\n",
"Component #4:\n",
"5.08135886392 and | 2.08203776196 = | 1.81304968596 like | 0.879870037792 foo | 0.430278574756 sleep | 0.310148150537 or | 0.266591033301 1=0 | 0.244518048707 pg_sleep | 0.240318175172 randomblob | 0.240318175172 abcdefg | 0.218424647454 upper | 0.217807024724 hex | 0.209605525197 generate_series | 0.204745548669 not | 0.180405722659 md5 | 0.173808582048 -- | 0.155211124295 all_users | 0.13097276018 count | 0.130119092636 null\u0000\n",
"Component #5:\n",
"3.58957048678 ; | 2.280246459 if | 2.20277556194 drop | 2.20220678044 function | 1.1330452364 sleep | 1.11962487375 foo | 0.753042105163 else | 0.751631652739 select | 0.580373800559 where | 0.482699633016 begin | 0.438444890413 -- | 0.430659489951 + | 0.429750535055 ;-- | 0.284700299065 rlike | 0.28443049589 0x28 | 0.281735835636 end;-- | 0.269170268038 dbms_lock | 0.266532740363 user_lock | 0.253969040389 end--\n",
"Component #6:\n",
"4.62714311684 as | 2.09281618106 rdb | 1.74303964771 sysusers | 1.2875093143 domain | 1.28576280354 count | 1.23811055811 t2 | 1.23811055811 t1 | 1.17148947089 t3 | 1.12044316248 sysibm | 1.12044316248 systables | 0.930295926397 all_users | 0.835710649477 from | 0.636937882758 fields | 0.636937882758 types | 0.636534203625 collations | 0.429031255562 domains | 0.39619622731 t4 | 0.35437438129 select | 0.33828700673 functions\n",
"Component #7:\n",
"2.93077372045 from | 2.01585263674 information_schema | 1.80465212726 limit | 1.77086668858 -- | 1.71650144854 0x71 | 1.44108638828 table_name | 1.23921631717 select | 1.17755832446 tables | 1.15225516905 concat | 1.03413747544 password | 0.985288007282 where | 0.882171553235 aes_encrypt | 0.882171553235 aes_decrypt | 0.879058732462 column_name | 0.8302361587 union | 0.786730342951 columns | 0.73475385426 0x7873716c696e6a656e64 | 0.734424467862 0x7873716c696e6a626567696e | 0.723868780223 -1\n",
"Component #8:\n",
"5.91167104508 +char | 2.25810728987 char | 0.644321593841 + | 0.465786395891 when | 0.465785714031 then | 0.465780299644 case | 0.465742543591 end | 0.446844874032 else | 0.347765112391 int | 0.31360923732 in | 0.152341855267 or | 0.148774278874 select | 0.106072759302 where | 0.104563261427 concat\n",
"Component #9:\n",
"2.69395065088 by | 2.5296447614 when | 2.5296126982 then | 2.52941897697 end | 2.52917991637 case | 2.43626068291 else | 2.16582649507 x | 1.7307241326 order | 1.69558549259 concat | 1.40993643516 select | 1.36423954591 group | 1.33299655951 rand | 1.32629700591 floor | 1.15467345655 a | 1.14912229761 or | 0.83938795519 from | 0.79558660506 1/ | 0.744853084004 count | 0.675190349298 character_sets\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment