Skip to content

Instantly share code, notes, and snippets.

@shotahorii
Last active May 29, 2017 23:13
Show Gist options
  • Save shotahorii/7961735ded18df871d76de462aefbef0 to your computer and use it in GitHub Desktop.
Save shotahorii/7961735ded18df871d76de462aefbef0 to your computer and use it in GitHub Desktop.
Find tags from keyword and get its documentation in SEC data
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find tags from keyword and get its documentation in SEC data\n",
"#### Pre-requirement\n",
"- [SECdataLoader](https://gist.github.com/shotahorii/71e1d1b450c9774e7b96d8c57afef809) is executed before.\n",
"- [sqlitehandler](https://gist.github.com/shotahorii/274b832570205e439bd54a3a8ce0bde6) is downloaded."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import functools as ft\n",
"import time\n",
"from sqlitehandler import SqliteHandler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Define constants"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"db = SqliteHandler('sec')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tag documentation table"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# get distinct tag names\n",
"df_distinctTags = db.table2df(\"select distinct tag from tag;\")\n",
"df_distinctTags.columns = ['tag'] # add header names\n",
"df_distinctTags = df_distinctTags[df_distinctTags['tag'].isnull()==False].reset_index(drop=True) # remove null"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# get all the tag data\n",
"df_all = db.table2df(\"select tag, file, doc from tag;\")\n",
"df_all.columns = ['tag', 'file', 'doc']\n",
"df_all = df_all[df_all['tag'].isnull()==False].reset_index(drop=True) # remove null"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# extract latest non null docs for each tag\n",
"df_all_nonNull = df_all[df_all['doc'].isnull()==False]\n",
"df_all_nonNull_latestFile = df_all_nonNull[['tag', 'file']].groupby(by='tag').max().reset_index()\n",
"df_all_nonNull_latestDoc = pd.merge(df_all_nonNull, df_all_nonNull_latestFile, how='inner', on=['tag', 'file'])\n",
"df_distinctDocs = df_all_nonNull_latestDoc.drop_duplicates('tag')[['tag', 'doc']]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_tagdocs = pd.merge(df_distinctTags, df_distinctDocs, how='left', on='tag')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Add record count to see popularity of the tag"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_tagCount = db.table2df(\"select tag, count(*) from num group by tag;\")\n",
"df_tagCount.columns = ['tag', 'count']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_tagmaster = pd.merge(df_tagdocs, df_tagCount, how='outer', on='tag')\n",
"df_tagmaster['count'].fillna(0, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"272097\n",
"218691\n",
"272981\n"
]
}
],
"source": [
"print(len(df_tagdocs))\n",
"print(len(df_tagCount))\n",
"print(len(df_tagmaster))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag</th>\n",
" <th>doc</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AccountsAndOtherReceivables</td>\n",
" <td>Amount due from customers, clients, other thir...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AccountsNotesAndLoansReceivableNetCurrent</td>\n",
" <td>The aggregate of amounts due from customers or...</td>\n",
" <td>6680</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AccountsPayable</td>\n",
" <td>Represents the Accounts payable, during the in...</td>\n",
" <td>2811</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag \\\n",
"0 AccountsAndOtherReceivables \n",
"1 AccountsNotesAndLoansReceivableNetCurrent \n",
"2 AccountsPayable \n",
"\n",
" doc count \n",
"0 Amount due from customers, clients, other thir... 18 \n",
"1 The aggregate of amounts due from customers or... 6680 \n",
"2 Represents the Accounts payable, during the in... 2811 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_tagmaster.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"#### Search function"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def search(partial):\n",
" alltags = df_tagmaster['tag'].tolist()\n",
" matched = list(filter(lambda x: str(partial).lower() in x.lower(), alltags))\n",
" df_matched = pd.DataFrame(matched, columns=['tag'])\n",
" return pd.merge(df_matched, df_tagmaster, how='inner', on='tag') \\\n",
" .sort_values('count', ascending=False).reset_index(drop=True)\n",
"\n",
"def getDoc(tagname):\n",
" try: return df_tagmaster[df_tagmaster['tag']==tagname]['doc'].tolist()[0]\n",
" except: print('No such tag')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag</th>\n",
" <th>doc</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Revenues</td>\n",
" <td>Amount of revenue recognized from goods sold, ...</td>\n",
" <td>363921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>SalesRevenueNet</td>\n",
" <td>Total revenue from sale of goods and services ...</td>\n",
" <td>245801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SalesRevenueGoodsNet</td>\n",
" <td>Aggregate revenue during the period from the s...</td>\n",
" <td>115644</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag doc \\\n",
"0 Revenues Amount of revenue recognized from goods sold, ... \n",
"1 SalesRevenueNet Total revenue from sale of goods and services ... \n",
"2 SalesRevenueGoodsNet Aggregate revenue during the period from the s... \n",
"\n",
" count \n",
"0 363921 \n",
"1 245801 \n",
"2 115644 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"search('revenue').head(3) # show top 3 hits"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Amount of revenue recognized from goods sold, services rendered, insurance premiums, or other activities that constitute an earning process. Includes, but is not limited to, investment and interest income before deduction of interest expense when recognized as a component of revenue, and sales and trading gain (loss).'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"getDoc('Revenues')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment