Last active
May 30, 2017 00:11
-
-
Save shotahorii/71e1d1b450c9774e7b96d8c57afef809 to your computer and use it in GitHub Desktop.
Loading SEC data from 30+ tsv files into 3 sqlite tables.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Loading SEC data to sqlite\n", | |
"\n", | |
"#### Data\n", | |
"- Data source (SEC) : https://www.sec.gov/dera/data/financial-statement-data-sets.html\n", | |
"- Documentation: https://www.sec.gov/data/financial-statements/aqfs.pdf\n", | |
"\n", | |
"#### Pre-requestment\n", | |
"- Downloaded all fnancial statement data sets from SEC (url above), and stored under the path \"./data/sec/\"\n", | |
"- Downloaded the module for handling sqlite from here: https://gist.github.com/shotahorii/274b832570205e439bd54a3a8ce0bde6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from os import listdir\n", | |
"from sqlitehandler import SqliteHandler" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Define constant params" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['2009q1', '2009q2', '2009q3', '2009q4', '2010q1', '2010q2', '2010q3', '2010q4', '2011q1', '2011q2', '2011q3', '2011q4', '2012q1', '2012q2', '2012q3', '2012q4', '2013q1', '2013q2', '2013q3', '2013q4', '2014q1', '2014q2', '2014q3', '2014q4', '2015q1', '2015q2', '2015q3', '2015q4', '2016q1', '2016q2', '2016q3', '2016q4', '2017q1']\n" | |
] | |
} | |
], | |
"source": [ | |
"FILEPATH_SUB = './data/sec/{}/sub.txt' # {} must replaced by foldername such as '2017q1'\n", | |
"FILEPATH_NUM = './data/sec/{}/num.txt'\n", | |
"FILEPATH_TAG = './data/sec/{}/tag.txt'\n", | |
"\n", | |
"FOLDERS = list(filter(lambda x: x[0]!='.', listdir('./data/sec/')))\n", | |
"print(FOLDERS) # each folder name indicates when containing statements are filed\n", | |
"\n", | |
"DBNAME = 'sec'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Define fucntions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def tsv2df(filepath):\n", | |
" with open(filepath, 'rb') as f:\n", | |
" df = pd.read_csv(f, sep='\\t', header=0, encoding='ISO-8859-1')\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Load the file and store into the database" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db = SqliteHandler(DBNAME)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"for folder in FOLDERS:\n", | |
" print('------ execute: ' + FOLDER + ' ------')\n", | |
" # filepaths\n", | |
" subfile = FILEPATH_SUB.format(FOLDER)\n", | |
" numfile = FILEPATH_NUM.format(FOLDER)\n", | |
" tagfile = FILEPATH_TAG.format(FOLDER)\n", | |
" \n", | |
" # load data\n", | |
" df_sub = tsv2df(subfile)\n", | |
" df_num = tsv2df(numfile)\n", | |
" df_tag = tsv2df(tagfile)\n", | |
" \n", | |
" # add a column indicating when the statement is filed\n", | |
" df_sub['file'] = FOLDER\n", | |
" df_num['file'] = FOLDER\n", | |
" df_tag['file'] = FOLDER\n", | |
" \n", | |
" # we found that sub file contains additional 2 columns only in 2016q2 and 2016q3.\n", | |
" # to keep consistence with other files , drop these columns\n", | |
" df.drop([Column Name or list],inplace=True,axis=1)\n", | |
" if 'floatdate' in list(df_sub.columns): df_sub.drop('floatdate',inplace=True,axis=1)\n", | |
" if 'pubfloatusd' in list(df_sub.columns): df_sub.drop('pubfloatusd',inplace=True,axis=1)\n", | |
" \n", | |
" # print data summary\n", | |
" print('sub records: ' + str(len(df_sub)))\n", | |
" print('num records: ' + str(len(df_num)))\n", | |
" print('tag records: ' + str(len(df_tag)))\n", | |
" \n", | |
" # store in database\n", | |
" db.df2table('sub', df_sub)\n", | |
" db.df2table('num', df_num)\n", | |
" db.df2table('tag', df_tag)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment