shotahorii · May 29, 2017 23:13
diff --git a/SECdata_TagSearcher.ipynb b/SECdata_TagSearcher.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find tags from keyword and get its documentation in SEC data\n",
    "#### Pre-requirement\n",
    "- [SECdataLoader](https://gist.github.com/shotahorii/71e1d1b450c9774e7b96d8c57afef809) is executed before.\n",
    "- [sqlitehandler](https://gist.github.com/shotahorii/274b832570205e439bd54a3a8ce0bde6) is downloaded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import functools as ft\n",
    "import time\n",
    "from sqlitehandler import SqliteHandler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Define constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "db = SqliteHandler('sec')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tag documentation table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# get distinct tag names\n",
    "df_distinctTags = db.table2df(\"select distinct tag from tag;\")\n",
    "df_distinctTags.columns = ['tag'] # add header names\n",
    "df_distinctTags = df_distinctTags[df_distinctTags['tag'].isnull()==False].reset_index(drop=True) # remove null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# get all the tag data\n",
    "df_all = db.table2df(\"select tag, file, doc from tag;\")\n",
    "df_all.columns = ['tag', 'file', 'doc']\n",
    "df_all = df_all[df_all['tag'].isnull()==False].reset_index(drop=True) # remove null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# extract latest non null docs for each tag\n",
    "df_all_nonNull = df_all[df_all['doc'].isnull()==False]\n",
    "df_all_nonNull_latestFile = df_all_nonNull[['tag', 'file']].groupby(by='tag').max().reset_index()\n",
    "df_all_nonNull_latestDoc = pd.merge(df_all_nonNull, df_all_nonNull_latestFile, how='inner', on=['tag', 'file'])\n",
    "df_distinctDocs = df_all_nonNull_latestDoc.drop_duplicates('tag')[['tag', 'doc']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_tagdocs = pd.merge(df_distinctTags, df_distinctDocs, how='left', on='tag')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add record count to see popularity of the tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_tagCount = db.table2df(\"select tag, count(*) from num group by tag;\")\n",
    "df_tagCount.columns = ['tag', 'count']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_tagmaster = pd.merge(df_tagdocs, df_tagCount, how='outer', on='tag')\n",
    "df_tagmaster['count'].fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "272097\n",
      "218691\n",
      "272981\n"
     ]
    }
   ],
   "source": [
    "print(len(df_tagdocs))\n",
    "print(len(df_tagCount))\n",
    "print(len(df_tagmaster))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag</th>\n",
       "      <th>doc</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AccountsAndOtherReceivables</td>\n",
       "      <td>Amount due from customers, clients, other thir...</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AccountsNotesAndLoansReceivableNetCurrent</td>\n",
       "      <td>The aggregate of amounts due from customers or...</td>\n",
       "      <td>6680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AccountsPayable</td>\n",
       "      <td>Represents the Accounts payable, during the in...</td>\n",
       "      <td>2811</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         tag  \\\n",
       "0                AccountsAndOtherReceivables   \n",
       "1  AccountsNotesAndLoansReceivableNetCurrent   \n",
       "2                            AccountsPayable   \n",
       "\n",
       "                                                 doc  count  \n",
       "0  Amount due from customers, clients, other thir...     18  \n",
       "1  The aggregate of amounts due from customers or...   6680  \n",
       "2  Represents the Accounts payable, during the in...   2811  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tagmaster.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Search function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def search(partial):\n",
    "    alltags = df_tagmaster['tag'].tolist()\n",
    "    matched = list(filter(lambda x: str(partial).lower() in x.lower(), alltags))\n",
    "    df_matched = pd.DataFrame(matched, columns=['tag'])\n",
    "    return pd.merge(df_matched, df_tagmaster, how='inner', on='tag') \\\n",
    "    .sort_values('count', ascending=False).reset_index(drop=True)\n",
    "\n",
    "def getDoc(tagname):\n",
    "    try: return df_tagmaster[df_tagmaster['tag']==tagname]['doc'].tolist()[0]\n",
    "    except: print('No such tag')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag</th>\n",
       "      <th>doc</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Revenues</td>\n",
       "      <td>Amount of revenue recognized from goods sold, ...</td>\n",
       "      <td>363921</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>SalesRevenueNet</td>\n",
       "      <td>Total revenue from sale of goods and services ...</td>\n",
       "      <td>245801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SalesRevenueGoodsNet</td>\n",
       "      <td>Aggregate revenue during the period from the s...</td>\n",
       "      <td>115644</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    tag                                                doc  \\\n",
       "0              Revenues  Amount of revenue recognized from goods sold, ...   \n",
       "1       SalesRevenueNet  Total revenue from sale of goods and services ...   \n",
       "2  SalesRevenueGoodsNet  Aggregate revenue during the period from the s...   \n",
       "\n",
       "    count  \n",
       "0  363921  \n",
       "1  245801  \n",
       "2  115644  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "search('revenue').head(3) # show top 3 hits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Amount of revenue recognized from goods sold, services rendered, insurance premiums, or other activities that constitute an earning process. Includes, but is not limited to, investment and interest income before deduction of interest expense when recognized as a component of revenue, and sales and trading gain (loss).'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "getDoc('Revenues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Find tags from keyword and get its documentation in SEC data\n",
	"#### Pre-requirement\n",
	"- [SECdataLoader](https://gist.github.com/shotahorii/71e1d1b450c9774e7b96d8c57afef809) is executed before.\n",
	"- [sqlitehandler](https://gist.github.com/shotahorii/274b832570205e439bd54a3a8ce0bde6) is downloaded."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import functools as ft\n",
	"import time\n",
	"from sqlitehandler import SqliteHandler"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Define constants"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"db = SqliteHandler('sec')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Tag documentation table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# get distinct tag names\n",
	"df_distinctTags = db.table2df(\"select distinct tag from tag;\")\n",
	"df_distinctTags.columns = ['tag'] # add header names\n",
	"df_distinctTags = df_distinctTags[df_distinctTags['tag'].isnull()==False].reset_index(drop=True) # remove null"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# get all the tag data\n",
	"df_all = db.table2df(\"select tag, file, doc from tag;\")\n",
	"df_all.columns = ['tag', 'file', 'doc']\n",
	"df_all = df_all[df_all['tag'].isnull()==False].reset_index(drop=True) # remove null"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# extract latest non null docs for each tag\n",
	"df_all_nonNull = df_all[df_all['doc'].isnull()==False]\n",
	"df_all_nonNull_latestFile = df_all_nonNull[['tag', 'file']].groupby(by='tag').max().reset_index()\n",
	"df_all_nonNull_latestDoc = pd.merge(df_all_nonNull, df_all_nonNull_latestFile, how='inner', on=['tag', 'file'])\n",
	"df_distinctDocs = df_all_nonNull_latestDoc.drop_duplicates('tag')[['tag', 'doc']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_tagdocs = pd.merge(df_distinctTags, df_distinctDocs, how='left', on='tag')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Add record count to see popularity of the tag"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_tagCount = db.table2df(\"select tag, count(*) from num group by tag;\")\n",
	"df_tagCount.columns = ['tag', 'count']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"df_tagmaster = pd.merge(df_tagdocs, df_tagCount, how='outer', on='tag')\n",
	"df_tagmaster['count'].fillna(0, inplace=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"272097\n",
	"218691\n",
	"272981\n"
	]
	}
	],
	"source": [
	"print(len(df_tagdocs))\n",
	"print(len(df_tagCount))\n",
	"print(len(df_tagmaster))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>tag</th>\n",
	" <th>doc</th>\n",
	" <th>count</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>AccountsAndOtherReceivables</td>\n",
	" <td>Amount due from customers, clients, other thir...</td>\n",
	" <td>18</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>AccountsNotesAndLoansReceivableNetCurrent</td>\n",
	" <td>The aggregate of amounts due from customers or...</td>\n",
	" <td>6680</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>AccountsPayable</td>\n",
	" <td>Represents the Accounts payable, during the in...</td>\n",
	" <td>2811</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" tag \\\n",
	"0 AccountsAndOtherReceivables \n",
	"1 AccountsNotesAndLoansReceivableNetCurrent \n",
	"2 AccountsPayable \n",
	"\n",
	" doc count \n",
	"0 Amount due from customers, clients, other thir... 18 \n",
	"1 The aggregate of amounts due from customers or... 6680 \n",
	"2 Represents the Accounts payable, during the in... 2811 "
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_tagmaster.head(3)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"#### Search function"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def search(partial):\n",
	" alltags = df_tagmaster['tag'].tolist()\n",
	" matched = list(filter(lambda x: str(partial).lower() in x.lower(), alltags))\n",
	" df_matched = pd.DataFrame(matched, columns=['tag'])\n",
	" return pd.merge(df_matched, df_tagmaster, how='inner', on='tag') \\\n",
	" .sort_values('count', ascending=False).reset_index(drop=True)\n",
	"\n",
	"def getDoc(tagname):\n",
	" try: return df_tagmaster[df_tagmaster['tag']==tagname]['doc'].tolist()[0]\n",
	" except: print('No such tag')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>tag</th>\n",
	" <th>doc</th>\n",
	" <th>count</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Revenues</td>\n",
	" <td>Amount of revenue recognized from goods sold, ...</td>\n",
	" <td>363921</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>SalesRevenueNet</td>\n",
	" <td>Total revenue from sale of goods and services ...</td>\n",
	" <td>245801</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>SalesRevenueGoodsNet</td>\n",
	" <td>Aggregate revenue during the period from the s...</td>\n",
	" <td>115644</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" tag doc \\\n",
	"0 Revenues Amount of revenue recognized from goods sold, ... \n",
	"1 SalesRevenueNet Total revenue from sale of goods and services ... \n",
	"2 SalesRevenueGoodsNet Aggregate revenue during the period from the s... \n",
	"\n",
	" count \n",
	"0 363921 \n",
	"1 245801 \n",
	"2 115644 "
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"search('revenue').head(3) # show top 3 hits"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Amount of revenue recognized from goods sold, services rendered, insurance premiums, or other activities that constitute an earning process. Includes, but is not limited to, investment and interest income before deduction of interest expense when recognized as a component of revenue, and sales and trading gain (loss).'"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"getDoc('Revenues')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}