Created
June 1, 2017 19:54
-
-
Save shotahorii/d062eb69138030ba55805071430c0a8a to your computer and use it in GitHub Desktop.
Create ticker - cik lookup table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Cik - Ticker lookup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas as pd\n", | |
"from os import listdir\n", | |
"from sqlitehandler import SqliteHandler\n", | |
"import cikTickerConverter as ctc" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1. Get cik numbers from all tickers in Nasdaq company lists" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### Load data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db_nasdaq = SqliteHandler('nasdaq')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_nasdaq = db_nasdaq.table2df(\"select * from companies;\")\n", | |
"df_nasdaq.columns = db_nasdaq.columns(\"companies\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tickers = df_nasdaq['Symbol'].tolist()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### Get cik numbers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"FROM = 6000\n", | |
"N = 1000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"targets = tickers[FROM:FROM+N]\n", | |
"filename = 'cik_'+str(FROM)+'-'+str(FROM+N-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"50\n", | |
"100\n", | |
"150\n", | |
"200\n", | |
"250\n", | |
"300\n", | |
"350\n", | |
"400\n", | |
"450\n", | |
"500\n", | |
"550\n", | |
"600\n", | |
"650\n", | |
"700\n" | |
] | |
} | |
], | |
"source": [ | |
"ciks = []\n", | |
"counter = 0\n", | |
"for ticker in targets:\n", | |
" counter += 1\n", | |
" if counter%50==0:print(counter)\n", | |
" c = ctc.ticker2cik(ticker)\n", | |
" ciks.append({'Symbol': ticker, 'cik': c})\n", | |
"\n", | |
"df_ciks = pd.DataFrame(ciks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciks.to_csv('./data/cik/'+filename+'.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 2. Get tickers from all ciks in cik list in SEC data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### Load data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db_sec = SqliteHandler('sec')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_sub = db_sec.table2df(\"select * from sub;\")\n", | |
"df_sub.columns = db_sec.columns(\"sub\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ciks = df_sub['cik'].drop_duplicates().tolist()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### Get tickers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 154, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"FROM = 10000\n", | |
"N = 2000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 155, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"targets = ciks[FROM:FROM+N]\n", | |
"filename = 'ticker_'+str(FROM)+'-'+str(FROM+N-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 156, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"50\n", | |
"100\n", | |
"150\n", | |
"200\n", | |
"250\n", | |
"300\n", | |
"350\n", | |
"400\n", | |
"450\n", | |
"500\n", | |
"550\n", | |
"600\n", | |
"650\n", | |
"700\n", | |
"750\n", | |
"800\n", | |
"850\n", | |
"900\n", | |
"950\n", | |
"1000\n", | |
"1050\n", | |
"1100\n", | |
"1150\n", | |
"1200\n" | |
] | |
} | |
], | |
"source": [ | |
"tickers = []\n", | |
"counter = 0\n", | |
"for cik in targets:\n", | |
" counter += 1\n", | |
" if counter%50==0:print(counter)\n", | |
" t = ctc.cik2ticker(cik)\n", | |
" tickers.append({'Symbol': t, 'cik': cik})\n", | |
"\n", | |
"df_tickers = pd.DataFrame(tickers)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 157, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_tickers.to_csv('./data/ticker/'+filename+'.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 3. Integrating" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<b>cik</b>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 162, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filenames_cik = list(filter(lambda x: x[0]!='.', listdir('./data/cik/')))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 163, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_cikmaster = pd.DataFrame([])\n", | |
"for filename in filenames_cik:\n", | |
" with open('./data/cik/'+filename, 'rb') as f:\n", | |
" df_new = pd.read_csv(f, header=0)\n", | |
" if len(df_cikmaster)==0:\n", | |
" df_cikmaster = df_new\n", | |
" else:\n", | |
" df_cikmaster = pd.concat([df_cikmaster, df_new])\n", | |
"\n", | |
"df_cikmaster = df_cikmaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<b>ticker</b>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 165, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filenames_ticker = list(filter(lambda x: x[0]!='.', listdir('./data/ticker/')))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 168, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_tickermaster = pd.DataFrame([])\n", | |
"for filename in filenames_ticker:\n", | |
" with open('./data/ticker/'+filename, 'rb') as f:\n", | |
" df_new = pd.read_csv(f, header=0)\n", | |
" if len(df_tickermaster)==0:\n", | |
" df_tickermaster = df_new\n", | |
" else:\n", | |
" df_tickermaster = pd.concat([df_tickermaster, df_new])\n", | |
"\n", | |
"df_tickermaster = df_tickermaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 4. Complementing - 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 216, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"failed_tickers = df_cikmaster[df_cikmaster['cik']==0]['ticker'].tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 217, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def standardise(ticker):\n", | |
" i1 = ticker.find('.')\n", | |
" i2 = ticker.find('^')\n", | |
" i3 = ticker.find('~')\n", | |
" il = [x for x in [i1,i2,i3] if x !=-1]\n", | |
" return ticker if len(il)==0 else ticker[:min(il)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 218, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"standardised = []\n", | |
"for ticker in failed_tickers:\n", | |
" standardised.append({'ticker': ticker, 'std_ticker': standardise(ticker)})\n", | |
"\n", | |
"df_standardised = pd.DataFrame(standardised)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 219, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"targets = df_standardised[df_standardised['ticker']!=df_standardised['std_ticker']] \\\n", | |
"['std_ticker'].drop_duplicates().tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 220, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"50\n", | |
"100\n", | |
"150\n", | |
"200\n", | |
"250\n" | |
] | |
} | |
], | |
"source": [ | |
"ciks = []\n", | |
"counter = 0\n", | |
"for ticker in targets:\n", | |
" counter += 1\n", | |
" if counter%50==0:print(counter)\n", | |
" c = ctc.ticker2cik(ticker)\n", | |
" ciks.append({'Symbol': ticker, 'cik': c})\n", | |
"\n", | |
"df_ciks = pd.DataFrame(ciks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 221, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciks.to_csv('./data/ticker/cik_standardised.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 227, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('./data/ticker/cik_standardised.csv', 'rb') as f:\n", | |
" df_cikmaster_std = pd.read_csv(f, header=0)\n", | |
"\n", | |
"df_cikmaster_std = df_cikmaster_std.reset_index(drop=True).rename(columns={'Symbol':'std_ticker'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 233, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_standardised_master = \\\n", | |
"pd.merge(df_standardised, df_cikmaster_std, how='left', on='std_ticker').fillna(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 242, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciknew = df_standardised_master[df_standardised_master['cik']!=0].drop('std_ticker', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 359, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciknewmaster_pre = pd.concat([df_cikmaster[df_cikmaster['cik']!=0], df_ciknew])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 372, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciknewmaster_l = []\n", | |
"for index, row in df_ciknewmaster_pre.iterrows():\n", | |
" df_ciknewmaster_l.append({'ticker':row['ticker'], 'cik': str(int(row['cik']))})\n", | |
"\n", | |
"df_ciknewmaster = pd.DataFrame(df_ciknewmaster_l)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 4. Complementing - 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 373, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"remains = df_standardised_master[df_standardised_master['cik']==0].drop('cik', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 374, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1225" | |
] | |
}, | |
"execution_count": 374, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(remains)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 375, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"obtainedTickers = df_tickermaster[df_tickermaster['ticker']!='0']\n", | |
"\n", | |
"temp = {}\n", | |
"for index, row in obtainedTickers.iterrows():\n", | |
" if row['ticker'] in temp.keys():\n", | |
" temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n", | |
" else:\n", | |
" temp[row['ticker']] = str(row['cik'])\n", | |
"\n", | |
"temp2 = []\n", | |
"for key in temp.keys():\n", | |
" temp2.append({'std_ticker': key, 'cik': temp[key]})\n", | |
" \n", | |
"df_obtained = pd.DataFrame(temp2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 376, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_obtained_new = pd.merge(remains, df_obtained, how='left', on='std_ticker')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 377, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_ciknewmaster2 = \\\n", | |
"pd.concat([df_ciknewmaster, df_obtained_new[df_obtained_new['cik'].isnull()==False]\n", | |
" .drop('std_ticker',axis=1)])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 5. Complementing - 3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 378, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"remains2 = df_obtained_new[df_obtained_new['cik'].isnull()==True].drop('cik', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 379, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('./data/dan.vonkohorn.com/cik-ticker-mappings.json') as f:\n", | |
" danlist = pd.DataFrame(json.load(f))\n", | |
" \n", | |
"temp = {}\n", | |
"for index, row in danlist.iterrows():\n", | |
" if row['ticker'] in temp.keys():\n", | |
" temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n", | |
" else:\n", | |
" temp[row['ticker']] = str(row['cik'])\n", | |
"\n", | |
"temp2 = []\n", | |
"for key in temp.keys():\n", | |
" temp2.append({'std_ticker': key, 'cik': temp[key]})\n", | |
" \n", | |
"df_danlist = pd.DataFrame(temp2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 380, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"matched_danlist = pd.merge(remains2, df_danlist, how='left', on='std_ticker') \\\n", | |
".drop('std_ticker', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 6. Complementing - 4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 387, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_master = pd.concat([df_ciknewmaster2, matched_danlist]).reset_index(drop=True).fillna(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 397, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_master[df_master.apply(lambda x: '|' in str(x['cik']), axis=1)].to_csv('to_be_corrected.csv')\n", | |
"# then manually check these ciks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 398, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_master_pre = df_master[df_master.apply(lambda x: '|' not in str(x['cik']), axis=1)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 403, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_corrected = pd.read_csv('./ticker_corrected.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 407, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_master_corrected = pd.concat([df_master_pre, df_corrected]).reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###### 7. Store" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 410, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"db_lookup = SqliteHandler('lookup')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 411, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"db_lookup.df2table('tickercik', df_master_corrected)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment