Instantly share code, notes, and snippets.
Last active
May 29, 2017 23:13
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save shotahorii/7961735ded18df871d76de462aefbef0 to your computer and use it in GitHub Desktop.
Find tags from keyword and get its documentation in SEC data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find tags from keyword and get its documentation in SEC data\n", | |
"#### Pre-requirement\n", | |
"- [SECdataLoader](https://gist.github.com/shotahorii/71e1d1b450c9774e7b96d8c57afef809) is executed before.\n", | |
"- [sqlitehandler](https://gist.github.com/shotahorii/274b832570205e439bd54a3a8ce0bde6) is downloaded." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import functools as ft\n", | |
"import time\n", | |
"from sqlitehandler import SqliteHandler" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Define constants" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db = SqliteHandler('sec')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Tag documentation table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# get distinct tag names\n", | |
"df_distinctTags = db.table2df(\"select distinct tag from tag;\")\n", | |
"df_distinctTags.columns = ['tag'] # add header names\n", | |
"df_distinctTags = df_distinctTags[df_distinctTags['tag'].isnull()==False].reset_index(drop=True) # remove null" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# get all the tag data\n", | |
"df_all = db.table2df(\"select tag, file, doc from tag;\")\n", | |
"df_all.columns = ['tag', 'file', 'doc']\n", | |
"df_all = df_all[df_all['tag'].isnull()==False].reset_index(drop=True) # remove null" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# extract latest non null docs for each tag\n", | |
"df_all_nonNull = df_all[df_all['doc'].isnull()==False]\n", | |
"df_all_nonNull_latestFile = df_all_nonNull[['tag', 'file']].groupby(by='tag').max().reset_index()\n", | |
"df_all_nonNull_latestDoc = pd.merge(df_all_nonNull, df_all_nonNull_latestFile, how='inner', on=['tag', 'file'])\n", | |
"df_distinctDocs = df_all_nonNull_latestDoc.drop_duplicates('tag')[['tag', 'doc']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_tagdocs = pd.merge(df_distinctTags, df_distinctDocs, how='left', on='tag')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Add record count to see popularity of the tag" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_tagCount = db.table2df(\"select tag, count(*) from num group by tag;\")\n", | |
"df_tagCount.columns = ['tag', 'count']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_tagmaster = pd.merge(df_tagdocs, df_tagCount, how='outer', on='tag')\n", | |
"df_tagmaster['count'].fillna(0, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"272097\n", | |
"218691\n", | |
"272981\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(df_tagdocs))\n", | |
"print(len(df_tagCount))\n", | |
"print(len(df_tagmaster))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>tag</th>\n", | |
" <th>doc</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AccountsAndOtherReceivables</td>\n", | |
" <td>Amount due from customers, clients, other thir...</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AccountsNotesAndLoansReceivableNetCurrent</td>\n", | |
" <td>The aggregate of amounts due from customers or...</td>\n", | |
" <td>6680</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AccountsPayable</td>\n", | |
" <td>Represents the Accounts payable, during the in...</td>\n", | |
" <td>2811</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" tag \\\n", | |
"0 AccountsAndOtherReceivables \n", | |
"1 AccountsNotesAndLoansReceivableNetCurrent \n", | |
"2 AccountsPayable \n", | |
"\n", | |
" doc count \n", | |
"0 Amount due from customers, clients, other thir... 18 \n", | |
"1 The aggregate of amounts due from customers or... 6680 \n", | |
"2 Represents the Accounts payable, during the in... 2811 " | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_tagmaster.head(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"#### Search function" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def search(partial):\n", | |
" alltags = df_tagmaster['tag'].tolist()\n", | |
" matched = list(filter(lambda x: str(partial).lower() in x.lower(), alltags))\n", | |
" df_matched = pd.DataFrame(matched, columns=['tag'])\n", | |
" return pd.merge(df_matched, df_tagmaster, how='inner', on='tag') \\\n", | |
" .sort_values('count', ascending=False).reset_index(drop=True)\n", | |
"\n", | |
"def getDoc(tagname):\n", | |
" try: return df_tagmaster[df_tagmaster['tag']==tagname]['doc'].tolist()[0]\n", | |
" except: print('No such tag')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>tag</th>\n", | |
" <th>doc</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Revenues</td>\n", | |
" <td>Amount of revenue recognized from goods sold, ...</td>\n", | |
" <td>363921</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>SalesRevenueNet</td>\n", | |
" <td>Total revenue from sale of goods and services ...</td>\n", | |
" <td>245801</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>SalesRevenueGoodsNet</td>\n", | |
" <td>Aggregate revenue during the period from the s...</td>\n", | |
" <td>115644</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" tag doc \\\n", | |
"0 Revenues Amount of revenue recognized from goods sold, ... \n", | |
"1 SalesRevenueNet Total revenue from sale of goods and services ... \n", | |
"2 SalesRevenueGoodsNet Aggregate revenue during the period from the s... \n", | |
"\n", | |
" count \n", | |
"0 363921 \n", | |
"1 245801 \n", | |
"2 115644 " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"search('revenue').head(3) # show top 3 hits" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Amount of revenue recognized from goods sold, services rendered, insurance premiums, or other activities that constitute an earning process. Includes, but is not limited to, investment and interest income before deduction of interest expense when recognized as a component of revenue, and sales and trading gain (loss).'" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"getDoc('Revenues')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment