jxnl · October 3, 2016 15:56
diff --git a/collect_combiner.ipynb b/collect_combiner.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from itertools import combinations\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class CollectCombiner(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"\n",
    "    CollectCombiner\n",
    "    ~~~~~~~~~~~~~~~\n",
    "    CollectCombiner is a Transformer for Pipeline objects.\n",
    "\n",
    "    Usage:\n",
    "        Initialize the CollectCombiner with the names of a `key` column and a\n",
    "        `value` column. Each input row is assumed to hold one key and one\n",
    "        value. Rows are grouped by key and the values of every group are\n",
    "        collected into a list. Combinations of 1 to `n` elements of each\n",
    "        sorted list are then generated to produce n-gram like tuples, which\n",
    "        are counted by a CountVectorizer.\n",
    "\n",
    "    Parameters:\n",
    "        key: name of the column to group by.\n",
    "        value: name of the column whose values are collected per key.\n",
    "        n: size of the largest combination to generate.\n",
    "\n",
    "    `fit` returns the estimator itself; `transform` and `fit_transform`\n",
    "    return the count matrix produced by the underlying CountVectorizer.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, key, value, n):\n",
    "        # BaseEstimator.get_params looks up attributes named after the\n",
    "        # constructor arguments, so the column names are stored as `key`\n",
    "        # and `value`; `k` and `v` remain available as aliases below.\n",
    "        self.key = key\n",
    "        self.value = value\n",
    "        self.n = n\n",
    "        self.vectorizer = CountVectorizer(\n",
    "            tokenizer=self._combiner,\n",
    "            preprocessor=self._I)\n",
    "\n",
    "    @property\n",
    "    def k(self):\n",
    "        # Short alias of `key`, kept for backward compatibility.\n",
    "        return self.key\n",
    "\n",
    "    @k.setter\n",
    "    def k(self, key):\n",
    "        self.key = key\n",
    "\n",
    "    @property\n",
    "    def v(self):\n",
    "        # Short alias of `value`, kept for backward compatibility.\n",
    "        return self.value\n",
    "\n",
    "    @v.setter\n",
    "    def v(self, value):\n",
    "        self.value = value\n",
    "\n",
    "    def _collecter(self, x):\n",
    "        \"\"\"Aggregation function: turn the values of one group into a list.\"\"\"\n",
    "        return list(x)\n",
    "\n",
    "    def _I(self, x):\n",
    "        \"\"\"Identity preprocessor: pass each collected list on unchanged.\"\"\"\n",
    "        return x\n",
    "\n",
    "    def _combiner(self, elems):\n",
    "        \"\"\"Return all combinations of 1 to `n` items of the sorted `elems`.\"\"\"\n",
    "        ordered = sorted(elems)\n",
    "        return [\n",
    "            combination\n",
    "            for size in range(1, self.n + 1)\n",
    "            for combination in combinations(ordered, size)\n",
    "        ]\n",
    "\n",
    "    def _grouped_values(self, X):\n",
    "        \"\"\"Group `X` by the key column and return the per-key value lists.\"\"\"\n",
    "        collected = X.groupby(self.key).agg({self.value: self._collecter})\n",
    "        return collected[self.value]\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Learn the vocabulary of combinations in `X` and return `self`.\"\"\"\n",
    "        self.vectorizer.fit(self._grouped_values(X))\n",
    "        # Returning the estimator allows chaining such as fit(X).transform(X).\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        \"\"\"Return the combination counts of `X` for the fitted vocabulary.\"\"\"\n",
    "        return self.vectorizer.transform(self._grouped_values(X))\n",
    "\n",
    "    def fit_transform(self, X, y=None, **fit_params):\n",
    "        \"\"\"Fit on `X` and return its combination counts.\"\"\"\n",
    "        return self.vectorizer.fit_transform(self._grouped_values(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   key  value\n",
       "0    1      1\n",
       "1    1      1\n",
       "2    2      3\n",
       "3    2      4\n",
       "4    2      5\n",
       "5    3      6\n",
       "6    4      7"
      ]
     },
     "execution_count": 217,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame({\"key\":[1,1,2,2,2,3,4], \"value\":[1,1,3,4,5,6,7]})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Group rows by \"key\", collect \"value\", and emit combinations of up to 2 values.\n",
    "c = CollectCombiner(key=\"key\", value=\"value\", n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>(1,)</th>\n",
       "      <th>(1, 1)</th>\n",
       "      <th>(3,)</th>\n",
       "      <th>(3, 4)</th>\n",
       "      <th>(3, 5)</th>\n",
       "      <th>(4,)</th>\n",
       "      <th>(4, 5)</th>\n",
       "      <th>(5,)</th>\n",
       "      <th>(6,)</th>\n",
       "      <th>(7,)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   (1,)  (1, 1)  (3,)  (3, 4)  (3, 5)  (4,)  (4, 5)  (5,)  (6,)  (7,)\n",
       "1     2       1     0       0       0     0       0     0     0     0\n",
       "2     0       0     1       1       1     1       1     1     0     0\n",
       "3     0       0     0       0       0     0       0     0     1     0\n",
       "4     0       0     0       0       0     0       0     0     0     1"
      ]
     },
     "execution_count": 219,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counts = c.fit_transform(df)\n",
    "vocabulary = c.vectorizer.vocabulary_\n",
    "pd.DataFrame(\n",
    "    counts.todense(),\n",
    "    # groupby emits one row per key in sorted key order.\n",
    "    index=sorted(df[\"key\"].unique()),\n",
    "    # Order the labels by vocabulary index so that each label lines up with\n",
    "    # its matrix column, instead of relying on dict iteration order.\n",
    "    columns=sorted(vocabulary, key=vocabulary.get))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 89,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"\n",
	"from sklearn.base import BaseEstimator, TransformerMixin\n",
	"from itertools import combinations\n",
	"\n",
	"from sklearn.feature_extraction.text import CountVectorizer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 216,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"class CollectCombiner(BaseEstimator, TransformerMixin):\n",
	"    \"\"\"\n",
	"    CollectCombiner\n",
	"    ~~~~~~~~~~~~~~~\n",
	"    CollectCombiner is a Transformer for Pipeline objects.\n",
	"\n",
	"    Usage:\n",
	"        Initialize the CollectCombiner with the names of a `key` column and a\n",
	"        `value` column. Each input row is assumed to hold one key and one\n",
	"        value. Rows are grouped by key and the values of every group are\n",
	"        collected into a list. Combinations of 1 to `n` elements of each\n",
	"        sorted list are then generated to produce n-gram like tuples, which\n",
	"        are counted by a CountVectorizer.\n",
	"\n",
	"    Parameters:\n",
	"        key: name of the column to group by.\n",
	"        value: name of the column whose values are collected per key.\n",
	"        n: size of the largest combination to generate.\n",
	"\n",
	"    `fit` returns the estimator itself; `transform` and `fit_transform`\n",
	"    return the count matrix produced by the underlying CountVectorizer.\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, key, value, n):\n",
	"        # BaseEstimator.get_params looks up attributes named after the\n",
	"        # constructor arguments, so the column names are stored as `key`\n",
	"        # and `value`; `k` and `v` remain available as aliases below.\n",
	"        self.key = key\n",
	"        self.value = value\n",
	"        self.n = n\n",
	"        self.vectorizer = CountVectorizer(\n",
	"            tokenizer=self._combiner,\n",
	"            preprocessor=self._I)\n",
	"\n",
	"    @property\n",
	"    def k(self):\n",
	"        # Short alias of `key`, kept for backward compatibility.\n",
	"        return self.key\n",
	"\n",
	"    @k.setter\n",
	"    def k(self, key):\n",
	"        self.key = key\n",
	"\n",
	"    @property\n",
	"    def v(self):\n",
	"        # Short alias of `value`, kept for backward compatibility.\n",
	"        return self.value\n",
	"\n",
	"    @v.setter\n",
	"    def v(self, value):\n",
	"        self.value = value\n",
	"\n",
	"    def _collecter(self, x):\n",
	"        \"\"\"Aggregation function: turn the values of one group into a list.\"\"\"\n",
	"        return list(x)\n",
	"\n",
	"    def _I(self, x):\n",
	"        \"\"\"Identity preprocessor: pass each collected list on unchanged.\"\"\"\n",
	"        return x\n",
	"\n",
	"    def _combiner(self, elems):\n",
	"        \"\"\"Return all combinations of 1 to `n` items of the sorted `elems`.\"\"\"\n",
	"        ordered = sorted(elems)\n",
	"        return [\n",
	"            combination\n",
	"            for size in range(1, self.n + 1)\n",
	"            for combination in combinations(ordered, size)\n",
	"        ]\n",
	"\n",
	"    def _grouped_values(self, X):\n",
	"        \"\"\"Group `X` by the key column and return the per-key value lists.\"\"\"\n",
	"        collected = X.groupby(self.key).agg({self.value: self._collecter})\n",
	"        return collected[self.value]\n",
	"\n",
	"    def fit(self, X, y=None):\n",
	"        \"\"\"Learn the vocabulary of combinations in `X` and return `self`.\"\"\"\n",
	"        self.vectorizer.fit(self._grouped_values(X))\n",
	"        # Returning the estimator allows chaining such as fit(X).transform(X).\n",
	"        return self\n",
	"\n",
	"    def transform(self, X, y=None):\n",
	"        \"\"\"Return the combination counts of `X` for the fitted vocabulary.\"\"\"\n",
	"        return self.vectorizer.transform(self._grouped_values(X))\n",
	"\n",
	"    def fit_transform(self, X, y=None, **fit_params):\n",
	"        \"\"\"Fit on `X` and return its combination counts.\"\"\"\n",
	"        return self.vectorizer.fit_transform(self._grouped_values(X))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 217,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>key</th>\n",
	" <th>value</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2</td>\n",
	" <td>4</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>2</td>\n",
	" <td>5</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>3</td>\n",
	" <td>6</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>4</td>\n",
	" <td>7</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" key value\n",
	"0 1 1\n",
	"1 1 1\n",
	"2 2 3\n",
	"3 2 4\n",
	"4 2 5\n",
	"5 3 6\n",
	"6 4 7"
	]
	},
	"execution_count": 217,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df = pd.DataFrame({\"key\":[1,1,2,2,2,3,4], \"value\":[1,1,3,4,5,6,7]})\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 218,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Group rows by \"key\", collect \"value\", and emit combinations of up to 2 values.\n",
	"c = CollectCombiner(key=\"key\", value=\"value\", n=2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 219,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>(1,)</th>\n",
	" <th>(1, 1)</th>\n",
	" <th>(3,)</th>\n",
	" <th>(3, 4)</th>\n",
	" <th>(3, 5)</th>\n",
	" <th>(4,)</th>\n",
	" <th>(4, 5)</th>\n",
	" <th>(5,)</th>\n",
	" <th>(6,)</th>\n",
	" <th>(7,)</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" (1,) (1, 1) (3,) (3, 4) (3, 5) (4,) (4, 5) (5,) (6,) (7,)\n",
	"1 2 1 0 0 0 0 0 0 0 0\n",
	"2 0 0 1 1 1 1 1 1 0 0\n",
	"3 0 0 0 0 0 0 0 0 1 0\n",
	"4 0 0 0 0 0 0 0 0 0 1"
	]
	},
	"execution_count": 219,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counts = c.fit_transform(df)\n",
	"vocabulary = c.vectorizer.vocabulary_\n",
	"pd.DataFrame(\n",
	"    counts.todense(),\n",
	"    # groupby emits one row per key in sorted key order.\n",
	"    index=sorted(df[\"key\"].unique()),\n",
	"    # Order the labels by vocabulary index so that each label lines up with\n",
	"    # its matrix column, instead of relying on dict iteration order.\n",
	"    columns=sorted(vocabulary, key=vocabulary.get))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}