Created
June 1, 2017 19:54
-
-
Save shotahorii/d062eb69138030ba55805071430c0a8a to your computer and use it in GitHub Desktop.
Create ticker - cik lookup table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Cik - Ticker lookup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 101, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "import pandas as pd\n", | |
| "from os import listdir\n", | |
| "from sqlitehandler import SqliteHandler\n", | |
| "import cikTickerConverter as ctc" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### 1. Get cik numbers from all tickers in Nasdaq company lists" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### Load data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 102, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "db_nasdaq = SqliteHandler('nasdaq')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 104, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_nasdaq = db_nasdaq.table2df(\"select * from companies;\")\n", | |
| "df_nasdaq.columns = db_nasdaq.columns(\"companies\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 106, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tickers = df_nasdaq['Symbol'].tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### Get cik numbers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 59, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "FROM = 6000\n", | |
| "N = 1000" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 60, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "targets = tickers[FROM:FROM+N]\n", | |
| "filename = 'cik_'+str(FROM)+'-'+str(FROM+N-1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "50\n", | |
| "100\n", | |
| "150\n", | |
| "200\n", | |
| "250\n", | |
| "300\n", | |
| "350\n", | |
| "400\n", | |
| "450\n", | |
| "500\n", | |
| "550\n", | |
| "600\n", | |
| "650\n", | |
| "700\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ciks = []\n", | |
| "counter = 0\n", | |
| "for ticker in targets:\n", | |
| " counter += 1\n", | |
| " if counter%50==0:print(counter)\n", | |
| " c = ctc.ticker2cik(ticker)\n", | |
| " ciks.append({'Symbol': ticker, 'cik': c})\n", | |
| "\n", | |
| "df_ciks = pd.DataFrame(ciks)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 62, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciks.to_csv('./data/cik/'+filename+'.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### 2. Get tickers from all ciks in cik list in SEC data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### Load data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 111, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "db_sec = SqliteHandler('sec')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 112, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_sub = db_sec.table2df(\"select * from sub;\")\n", | |
| "df_sub.columns = db_sec.columns(\"sub\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 114, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "ciks = df_sub['cik'].drop_duplicates().tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### Get tickers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 154, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "FROM = 10000\n", | |
| "N = 2000" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 155, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "targets = ciks[FROM:FROM+N]\n", | |
| "filename = 'ticker_'+str(FROM)+'-'+str(FROM+N-1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 156, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "50\n", | |
| "100\n", | |
| "150\n", | |
| "200\n", | |
| "250\n", | |
| "300\n", | |
| "350\n", | |
| "400\n", | |
| "450\n", | |
| "500\n", | |
| "550\n", | |
| "600\n", | |
| "650\n", | |
| "700\n", | |
| "750\n", | |
| "800\n", | |
| "850\n", | |
| "900\n", | |
| "950\n", | |
| "1000\n", | |
| "1050\n", | |
| "1100\n", | |
| "1150\n", | |
| "1200\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "tickers = []\n", | |
| "counter = 0\n", | |
| "for cik in targets:\n", | |
| " counter += 1\n", | |
| " if counter%50==0:print(counter)\n", | |
| " t = ctc.cik2ticker(cik)\n", | |
| " tickers.append({'Symbol': t, 'cik': cik})\n", | |
| "\n", | |
| "df_tickers = pd.DataFrame(tickers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 157, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_tickers.to_csv('./data/ticker/'+filename+'.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 3. Integrating" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "<b>cik</b>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 162, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "filenames_cik = list(filter(lambda x: x[0]!='.', listdir('./data/cik/')))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 163, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_cikmaster = pd.DataFrame([])\n", | |
| "for filename in filenames_cik:\n", | |
| " with open('./data/cik/'+filename, 'rb') as f:\n", | |
| " df_new = pd.read_csv(f, header=0)\n", | |
| " if len(df_cikmaster)==0:\n", | |
| " df_cikmaster = df_new\n", | |
| " else:\n", | |
| " df_cikmaster = pd.concat([df_cikmaster, df_new])\n", | |
| "\n", | |
| "df_cikmaster = df_cikmaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "<b>ticker</b>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 165, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "filenames_ticker = list(filter(lambda x: x[0]!='.', listdir('./data/ticker/')))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 168, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_tickermaster = pd.DataFrame([])\n", | |
| "for filename in filenames_ticker:\n", | |
| " with open('./data/ticker/'+filename, 'rb') as f:\n", | |
| " df_new = pd.read_csv(f, header=0)\n", | |
| " if len(df_tickermaster)==0:\n", | |
| " df_tickermaster = df_new\n", | |
| " else:\n", | |
| " df_tickermaster = pd.concat([df_tickermaster, df_new])\n", | |
| "\n", | |
| "df_tickermaster = df_tickermaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 4. Complementing - 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 216, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "failed_tickers = df_cikmaster[df_cikmaster['cik']==0]['ticker'].tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 217, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def standardise(ticker):\n", | |
| " i1 = ticker.find('.')\n", | |
| " i2 = ticker.find('^')\n", | |
| " i3 = ticker.find('~')\n", | |
| " il = [x for x in [i1,i2,i3] if x !=-1]\n", | |
| " return ticker if len(il)==0 else ticker[:min(il)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 218, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "standardised = []\n", | |
| "for ticker in failed_tickers:\n", | |
| " standardised.append({'ticker': ticker, 'std_ticker': standardise(ticker)})\n", | |
| "\n", | |
| "df_standardised = pd.DataFrame(standardised)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 219, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "targets = df_standardised[df_standardised['ticker']!=df_standardised['std_ticker']] \\\n", | |
| "['std_ticker'].drop_duplicates().tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 220, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "50\n", | |
| "100\n", | |
| "150\n", | |
| "200\n", | |
| "250\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ciks = []\n", | |
| "counter = 0\n", | |
| "for ticker in targets:\n", | |
| " counter += 1\n", | |
| " if counter%50==0:print(counter)\n", | |
| " c = ctc.ticker2cik(ticker)\n", | |
| " ciks.append({'Symbol': ticker, 'cik': c})\n", | |
| "\n", | |
| "df_ciks = pd.DataFrame(ciks)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 221, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciks.to_csv('./data/ticker/cik_standardised.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 227, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with open('./data/ticker/cik_standardised.csv', 'rb') as f:\n", | |
| " df_cikmaster_std = pd.read_csv(f, header=0)\n", | |
| "\n", | |
| "df_cikmaster_std = df_cikmaster_std.reset_index(drop=True).rename(columns={'Symbol':'std_ticker'})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 233, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_standardised_master = \\\n", | |
| "pd.merge(df_standardised, df_cikmaster_std, how='left', on='std_ticker').fillna(0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 242, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciknew = df_standardised_master[df_standardised_master['cik']!=0].drop('std_ticker', axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 359, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciknewmaster_pre = pd.concat([df_cikmaster[df_cikmaster['cik']!=0], df_ciknew])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 372, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciknewmaster_l = []\n", | |
| "for index, row in df_ciknewmaster_pre.iterrows():\n", | |
| " df_ciknewmaster_l.append({'ticker':row['ticker'], 'cik': str(int(row['cik']))})\n", | |
| "\n", | |
| "df_ciknewmaster = pd.DataFrame(df_ciknewmaster_l)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 4. Complementing - 2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 373, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "remains = df_standardised_master[df_standardised_master['cik']==0].drop('cik', axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 374, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "1225" | |
| ] | |
| }, | |
| "execution_count": 374, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(remains)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 375, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "obtainedTickers = df_tickermaster[df_tickermaster['ticker']!='0']\n", | |
| "\n", | |
| "temp = {}\n", | |
| "for index, row in obtainedTickers.iterrows():\n", | |
| " if row['ticker'] in temp.keys():\n", | |
| " temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n", | |
| " else:\n", | |
| " temp[row['ticker']] = str(row['cik'])\n", | |
| "\n", | |
| "temp2 = []\n", | |
| "for key in temp.keys():\n", | |
| " temp2.append({'std_ticker': key, 'cik': temp[key]})\n", | |
| " \n", | |
| "df_obtained = pd.DataFrame(temp2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 376, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_obtained_new = pd.merge(remains, df_obtained, how='left', on='std_ticker')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 377, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_ciknewmaster2 = \\\n", | |
| "pd.concat([df_ciknewmaster, df_obtained_new[df_obtained_new['cik'].isnull()==False]\n", | |
| " .drop('std_ticker',axis=1)])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 5. Complementing - 3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 378, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "remains2 = df_obtained_new[df_obtained_new['cik'].isnull()==True].drop('cik', axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 379, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with open('./data/dan.vonkohorn.com/cik-ticker-mappings.json') as f:\n", | |
| " danlist = pd.DataFrame(json.load(f))\n", | |
| " \n", | |
| "temp = {}\n", | |
| "for index, row in danlist.iterrows():\n", | |
| " if row['ticker'] in temp.keys():\n", | |
| " temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n", | |
| " else:\n", | |
| " temp[row['ticker']] = str(row['cik'])\n", | |
| "\n", | |
| "temp2 = []\n", | |
| "for key in temp.keys():\n", | |
| " temp2.append({'std_ticker': key, 'cik': temp[key]})\n", | |
| " \n", | |
| "df_danlist = pd.DataFrame(temp2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 380, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "matched_danlist = pd.merge(remains2, df_danlist, how='left', on='std_ticker') \\\n", | |
| ".drop('std_ticker', axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 6. Complementing - 4" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 387, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_master = pd.concat([df_ciknewmaster2, matched_danlist]).reset_index(drop=True).fillna(0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 397, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_master[df_master.apply(lambda x: '|' in str(x['cik']), axis=1)].to_csv('to_be_corrected.csv')\n", | |
| "# then manually check these ciks" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 398, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_master_pre = df_master[df_master.apply(lambda x: '|' not in str(x['cik']), axis=1)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 403, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_corrected = pd.read_csv('./ticker_corrected.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 407, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_master_corrected = pd.concat([df_master_pre, df_corrected]).reset_index(drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "###### 7. Store" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 410, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "db_lookup = SqliteHandler('lookup')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 411, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "db_lookup.df2table('tickercik', df_master_corrected)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment