{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cik - Ticker lookup"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"from os import listdir\n",
"from sqlitehandler import SqliteHandler\n",
"import cikTickerConverter as ctc"
]
},
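{
"cell_type": "markdown",
"metadata": {},
"source": [
"`sqlitehandler` and `cikTickerConverter` are local modules that are not included in this gist. As a rough sketch only (an assumption, not the actual implementation), `SqliteHandler` is treated below as a thin wrapper around `sqlite3` and `pandas` exposing the three methods this notebook uses: `table2df`, `columns` and `df2table`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Hypothetical sketch of the SqliteHandler interface assumed by this notebook.\n",
"# The real module is not included in the gist; only the three methods used\n",
"# below (table2df, columns, df2table) are sketched.\n",
"import sqlite3\n",
"import pandas as pd\n",
"\n",
"\n",
"class SqliteHandlerSketch:\n",
"    def __init__(self, name):\n",
"        # assumption: one SQLite file per database name\n",
"        self.conn = sqlite3.connect(name + '.sqlite')\n",
"\n",
"    def table2df(self, query):\n",
"        # run an arbitrary SELECT and return the result as a DataFrame\n",
"        return pd.read_sql_query(query, self.conn)\n",
"\n",
"    def columns(self, table):\n",
"        # return the column names of a table via PRAGMA table_info\n",
"        cur = self.conn.execute('PRAGMA table_info(%s)' % table)\n",
"        return [row[1] for row in cur.fetchall()]\n",
"\n",
"    def df2table(self, table, df):\n",
"        # write a DataFrame to a table, replacing it if it already exists\n",
"        df.to_sql(table, self.conn, if_exists='replace', index=False)"
]
},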
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. Get cik numbers from all tickers in Nasdaq company lists"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Load data"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"db_nasdaq = SqliteHandler('nasdaq')"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_nasdaq = db_nasdaq.table2df(\"select * from companies;\")\n",
"df_nasdaq.columns = db_nasdaq.columns(\"companies\")"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tickers = df_nasdaq['Symbol'].tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Get cik numbers"
]
},
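{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ctc.ticker2cik` comes from the local `cikTickerConverter` module, which is also not included in the gist. A minimal sketch of one possible implementation, assuming it queries the EDGAR company search and returns 0 when no CIK is found (the `cik == 0` sentinel is relied on in step 4 below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Hypothetical sketch of cikTickerConverter.ticker2cik (not the actual module).\n",
"# Assumption: it looks the ticker up on the EDGAR company search page and\n",
"# returns the CIK as an int, or 0 when there is no match.\n",
"import re\n",
"import requests\n",
"\n",
"\n",
"def ticker2cik_sketch(ticker):\n",
"    url = 'https://www.sec.gov/cgi-bin/browse-edgar'\n",
"    params = {'action': 'getcompany', 'CIK': ticker, 'type': '10-K',\n",
"              'dateb': '', 'owner': 'include', 'count': '10'}\n",
"    html = requests.get(url, params=params).text\n",
"    match = re.search(r'CIK=(\\d{10})', html)\n",
"    return int(match.group(1)) if match else 0"
]
},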
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"FROM = 6000\n",
"N = 1000"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"targets = tickers[FROM:FROM+N]\n",
"filename = 'cik_'+str(FROM)+'-'+str(FROM+N-1)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n",
"100\n",
"150\n",
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"500\n",
"550\n",
"600\n",
"650\n",
"700\n"
]
}
],
"source": [
"ciks = []\n",
"counter = 0\n",
"for ticker in targets:\n",
" counter += 1\n",
" if counter%50==0:print(counter)\n",
" c = ctc.ticker2cik(ticker)\n",
" ciks.append({'Symbol': ticker, 'cik': c})\n",
"\n",
"df_ciks = pd.DataFrame(ciks)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_ciks.to_csv('./data/cik/'+filename+'.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2. Get tickers from all ciks in cik list in SEC data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Load data"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"db_sec = SqliteHandler('sec')"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_sub = db_sec.table2df(\"select * from sub;\")\n",
"df_sub.columns = db_sec.columns(\"sub\")"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ciks = df_sub['cik'].drop_duplicates().tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Get tickers"
]
},
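{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ctc.cik2ticker` is the reverse lookup from the same local module. As a sketch only (an assumption about its behaviour), it could be backed by a cached CIK-to-ticker mapping such as the dan.vonkohorn.com JSON file used in step 6, returning the string `'0'` when the CIK is unknown (the sentinel filtered out in step 5):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Hypothetical sketch of cikTickerConverter.cik2ticker (not the actual module).\n",
"# Assumption: it consults a cached cik -> ticker mapping (here the\n",
"# dan.vonkohorn.com JSON used in step 6) and returns '0' for unknown CIKs,\n",
"# matching the ticker != '0' filter in step 5.\n",
"import json\n",
"import pandas as pd\n",
"\n",
"with open('./data/dan.vonkohorn.com/cik-ticker-mappings.json') as f:\n",
"    _df_map = pd.DataFrame(json.load(f))  # columns include 'cik' and 'ticker'\n",
"\n",
"_cik_to_ticker = dict(zip(_df_map['cik'].astype(str), _df_map['ticker']))\n",
"\n",
"\n",
"def cik2ticker_sketch(cik):\n",
"    return _cik_to_ticker.get(str(cik), '0')"
]
},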
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"FROM = 10000\n",
"N = 2000"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"targets = ciks[FROM:FROM+N]\n",
"filename = 'ticker_'+str(FROM)+'-'+str(FROM+N-1)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n",
"100\n",
"150\n",
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"500\n",
"550\n",
"600\n",
"650\n",
"700\n",
"750\n",
"800\n",
"850\n",
"900\n",
"950\n",
"1000\n",
"1050\n",
"1100\n",
"1150\n",
"1200\n"
]
}
],
"source": [
"tickers = []\n",
"counter = 0\n",
"for cik in targets:\n",
" counter += 1\n",
" if counter%50==0:print(counter)\n",
" t = ctc.cik2ticker(cik)\n",
" tickers.append({'Symbol': t, 'cik': cik})\n",
"\n",
"df_tickers = pd.DataFrame(tickers)"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_tickers.to_csv('./data/ticker/'+filename+'.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 3. Integrating"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<b>cik</b>"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filenames_cik = list(filter(lambda x: x[0]!='.', listdir('./data/cik/')))"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_cikmaster = pd.DataFrame([])\n",
"for filename in filenames_cik:\n",
" with open('./data/cik/'+filename, 'rb') as f:\n",
" df_new = pd.read_csv(f, header=0)\n",
" if len(df_cikmaster)==0:\n",
" df_cikmaster = df_new\n",
" else:\n",
" df_cikmaster = pd.concat([df_cikmaster, df_new])\n",
"\n",
"df_cikmaster = df_cikmaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<b>ticker</b>"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filenames_ticker = list(filter(lambda x: x[0]!='.', listdir('./data/ticker/')))"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_tickermaster = pd.DataFrame([])\n",
"for filename in filenames_ticker:\n",
" with open('./data/ticker/'+filename, 'rb') as f:\n",
" df_new = pd.read_csv(f, header=0)\n",
" if len(df_tickermaster)==0:\n",
" df_tickermaster = df_new\n",
" else:\n",
" df_tickermaster = pd.concat([df_tickermaster, df_new])\n",
"\n",
"df_tickermaster = df_tickermaster.reset_index(drop=True).rename(columns={'Symbol':'ticker'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 4. Complementing - 1"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"failed_tickers = df_cikmaster[df_cikmaster['cik']==0]['ticker'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def standardise(ticker):\n",
" i1 = ticker.find('.')\n",
" i2 = ticker.find('^')\n",
" i3 = ticker.find('~')\n",
" il = [x for x in [i1,i2,i3] if x !=-1]\n",
" return ticker if len(il)==0 else ticker[:min(il)]"
]
},
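{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nasdaq symbols can carry class or issue suffixes separated by `.`, `^` or `~` that EDGAR does not recognise; `standardise` simply truncates the ticker at the first such separator. A few illustrative calls (the example tickers are only for demonstration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Illustrative calls to standardise: everything from the first '.', '^' or '~'\n",
"# onwards is dropped. Expected results are shown as comments.\n",
"for t in ['AAPL', 'AKO.A', 'GDV^A', 'AIW~']:\n",
"    print(t, '->', standardise(t))\n",
"# AAPL  -> AAPL\n",
"# AKO.A -> AKO\n",
"# GDV^A -> GDV\n",
"# AIW~  -> AIW"
]
},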
{
"cell_type": "code",
"execution_count": 218,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"standardised = []\n",
"for ticker in failed_tickers:\n",
" standardised.append({'ticker': ticker, 'std_ticker': standardise(ticker)})\n",
"\n",
"df_standardised = pd.DataFrame(standardised)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"targets = df_standardised[df_standardised['ticker']!=df_standardised['std_ticker']] \\\n",
"['std_ticker'].drop_duplicates().tolist()"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n",
"100\n",
"150\n",
"200\n",
"250\n"
]
}
],
"source": [
"ciks = []\n",
"counter = 0\n",
"for ticker in targets:\n",
" counter += 1\n",
" if counter%50==0:print(counter)\n",
" c = ctc.ticker2cik(ticker)\n",
" ciks.append({'Symbol': ticker, 'cik': c})\n",
"\n",
"df_ciks = pd.DataFrame(ciks)"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_ciks.to_csv('./data/ticker/cik_standardised.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 227,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('./data/ticker/cik_standardised.csv', 'rb') as f:\n",
" df_cikmaster_std = pd.read_csv(f, header=0)\n",
"\n",
"df_cikmaster_std = df_cikmaster_std.reset_index(drop=True).rename(columns={'Symbol':'std_ticker'})"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"df_standardised_master = \\\n",
"pd.merge(df_standardised, df_cikmaster_std, how='left', on='std_ticker').fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 242,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"df_ciknew = df_standardised_master[df_standardised_master['cik']!=0].drop('std_ticker', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_ciknewmaster_pre = pd.concat([df_cikmaster[df_cikmaster['cik']!=0], df_ciknew])"
]
},
{
"cell_type": "code",
"execution_count": 372,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_ciknewmaster_l = []\n",
"for index, row in df_ciknewmaster_pre.iterrows():\n",
" df_ciknewmaster_l.append({'ticker':row['ticker'], 'cik': str(int(row['cik']))})\n",
"\n",
"df_ciknewmaster = pd.DataFrame(df_ciknewmaster_l)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 4. Complementing - 2"
]
},
{
"cell_type": "code",
"execution_count": 373,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"remains = df_standardised_master[df_standardised_master['cik']==0].drop('cik', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 374,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1225"
]
},
"execution_count": 374,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(remains)"
]
},
{
"cell_type": "code",
"execution_count": 375,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"obtainedTickers = df_tickermaster[df_tickermaster['ticker']!='0']\n",
"\n",
"temp = {}\n",
"for index, row in obtainedTickers.iterrows():\n",
" if row['ticker'] in temp.keys():\n",
" temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n",
" else:\n",
" temp[row['ticker']] = str(row['cik'])\n",
"\n",
"temp2 = []\n",
"for key in temp.keys():\n",
" temp2.append({'std_ticker': key, 'cik': temp[key]})\n",
" \n",
"df_obtained = pd.DataFrame(temp2)"
]
},
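{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above collapses duplicate tickers by joining their CIKs with `|`. An equivalent, more compact pandas version (a sketch, not what was actually run) would be:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Equivalent groupby-based sketch of the aggregation above: one row per ticker,\n",
"# with all CIKs for that ticker joined by '|'.\n",
"df_obtained_alt = (obtainedTickers\n",
"                   .groupby('ticker')['cik']\n",
"                   .apply(lambda s: '|'.join(s.astype(str)))\n",
"                   .reset_index()\n",
"                   .rename(columns={'ticker': 'std_ticker'}))"
]
},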
{
"cell_type": "code",
"execution_count": 376,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_obtained_new = pd.merge(remains, df_obtained, how='left', on='std_ticker')"
]
},
{
"cell_type": "code",
"execution_count": 377,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_ciknewmaster2 = \\\n",
"pd.concat([df_ciknewmaster, df_obtained_new[df_obtained_new['cik'].isnull()==False]\n",
" .drop('std_ticker',axis=1)])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 5. Complementing - 3"
]
},
{
"cell_type": "code",
"execution_count": 378,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"remains2 = df_obtained_new[df_obtained_new['cik'].isnull()==True].drop('cik', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 379,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open('./data/dan.vonkohorn.com/cik-ticker-mappings.json') as f:\n",
" danlist = pd.DataFrame(json.load(f))\n",
" \n",
"temp = {}\n",
"for index, row in danlist.iterrows():\n",
" if row['ticker'] in temp.keys():\n",
" temp[row['ticker']] = temp[row['ticker']] + '|' + str(row['cik'])\n",
" else:\n",
" temp[row['ticker']] = str(row['cik'])\n",
"\n",
"temp2 = []\n",
"for key in temp.keys():\n",
" temp2.append({'std_ticker': key, 'cik': temp[key]})\n",
" \n",
"df_danlist = pd.DataFrame(temp2)"
]
},
{
"cell_type": "code",
"execution_count": 380,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"matched_danlist = pd.merge(remains2, df_danlist, how='left', on='std_ticker') \\\n",
".drop('std_ticker', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 6. Complementing - 4"
]
},
{
"cell_type": "code",
"execution_count": 387,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_master = pd.concat([df_ciknewmaster2, matched_danlist]).reset_index(drop=True).fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 397,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_master[df_master.apply(lambda x: '|' in str(x['cik']), axis=1)].to_csv('to_be_corrected.csv')\n",
"# then manually check these ciks"
]
},
{
"cell_type": "code",
"execution_count": 398,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_master_pre = df_master[df_master.apply(lambda x: '|' not in str(x['cik']), axis=1)]"
]
},
{
"cell_type": "code",
"execution_count": 403,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_corrected = pd.read_csv('./ticker_corrected.csv')"
]
},
{
"cell_type": "code",
"execution_count": 407,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_master_corrected = pd.concat([df_master_pre, df_corrected]).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### 7. Store"
]
},
{
"cell_type": "code",
"execution_count": 410,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"db_lookup = SqliteHandler('lookup')"
]
},
{
"cell_type": "code",
"execution_count": 411,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"db_lookup.df2table('tickercik', df_master_corrected)"
]
},
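{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not executed in the original notebook), the stored table can be read back with the same `SqliteHandler` methods used above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read the stored lookup table back to verify it round-trips (hypothetical,\n",
"# not run in the original notebook).\n",
"df_check = db_lookup.table2df(\"select * from tickercik;\")\n",
"df_check.columns = db_lookup.columns(\"tickercik\")\n",
"df_check.head()"
]
},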
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}