Header file for 90% of the Notebook necessities.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# system/os/regex and basic math functions\n",
"import os\n",
"import re\n",
"import sys\n",
"import math\n",
"import json\n",
"import time\n",
"import string\n",
"import dateutil\n",
"import datetime as dt\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set logging level\n",
"import logging\n",
"try:\n",
"    kwargs = {'level':getattr(logging, LOG_LEVEL)}\n",
"except NameError:\n",
"    kwargs = {'level':logging.WARNING}\n",
"    print('Set LOG_LEVEL=\"INFO\" before running the import file to get moar output.')\n",
"try:\n",
"    kwargs['format'] = LOG_FORMAT\n",
"except NameError:\n",
"    kwargs['format'] = \"%(levelname)s::%(message)s\"\n",
"    print('Set LOG_FORMAT to change log format.')\n",
"\n",
"logging.basicConfig(**kwargs)\n",
"logger = logging.getLogger('notebook')\n",
"del kwargs\n",
"\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)"
]
},
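{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): LOG_LEVEL and\n",
"# LOG_FORMAT are plain module-level names the cell above looks for, so define\n",
"# them *before* running this header notebook (e.g. via %run) to change logging:\n",
"#     LOG_LEVEL = 'INFO'\n",
"#     LOG_FORMAT = '%(asctime)s %(levelname)s::%(message)s'\n",
"logger.info('Logging configured at %s', logging.getLevelName(logger.getEffectiveLevel()))"
]
},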
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# IPython display convenience stuff\n",
"try:\n",
"    from IPython.display import HTML, display, display_html, display_javascript\n",
"    from IPython import __version__ as ipythonversion\n",
"    import ipywidgets\n",
"    print(\"IPython: {}\".format(ipythonversion))\n",
"except ImportError:\n",
"    pass"
]
},
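{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): display and\n",
"# HTML are the standard IPython display API imported above; guard on the import\n",
"# having succeeded so the header still runs outside IPython.\n",
"if 'HTML' in globals():\n",
"    display(HTML('<b>Header loaded</b>'))"
]
},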
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # numpy for matrix algebra\n",
"    import numpy as np\n",
"    os.environ['NUMEXPR_MAX_THREADS'] = '20'\n",
"    print(\"Numpy (np): {}\".format(np.version.full_version))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # scipy for probability distributions and some statistical tests\n",
"    import scipy as sp\n",
"    import scipy.stats as stats\n",
"    print(\"Scipy (sp, stats): {}\".format(sp.version.full_version))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # pandas for data manipulation\n",
"    import pandas as pd\n",
"    print(\"Pandas (pd): {}\".format(pd.__version__))\n",
"\n",
"    def fmt_float(float_in, rstrip0s=re.compile(r'\\.0+$')):\n",
"        \"\"\"Comma-format floats: 3 decimals below 1,000, none at or above (trailing .000 stripped).\"\"\"\n",
"        try:\n",
"            return rstrip0s.sub('', '{0:,.{1}f}'.format(float_in, 3 - 3 * bool(abs(float_in) // 1000)))\n",
"        except Exception:\n",
"            return str(float_in)\n",
"    pd.set_option('float_format', fmt_float)\n",
"    pd.set_option('display.max_rows', 250)\n",
"    pd.set_option('display.max_columns', 250)\n",
"    pd.set_option('display.notebook_repr_html', True)\n",
"\n",
"    def latex_format(num_in):\n",
"        \"\"\"Format numbers for LaTeX tables\"\"\"\n",
"        try:\n",
"            num_in = float(num_in)\n",
"            if num_in == 0:\n",
"                return \"0\"\n",
"            num_dig = np.log10(abs(num_in)) + 1\n",
"            if num_dig >= 3:\n",
"                return f\"{int(num_in):,d}\"\n",
"            elif num_dig >= 1:\n",
"                return f\"{num_in:2.1f}\"\n",
"            return f\"{num_in:1.3f}\"\n",
"        except ValueError:\n",
"            return str(num_in)\n",
"\n",
"    def S(df, cols=None, keep_dups=False):\n",
"        \"\"\"S splits strings, and if called with a df input, interpolates variable names.\n",
"\n",
"        Example::\n",
"            S('gvkey datadate')       # --> ['gvkey', 'datadate']\n",
"            df.S('gvk* datad* num*')  # --> ['gvkey', 'datadate', 'num_words', 'num_sentences']\n",
"        \"\"\"\n",
"        if isinstance(df, str):\n",
"            cols = df\n",
"        if isinstance(cols, str):\n",
"            new_cols = []\n",
"            for col in cols.split():\n",
"                if '*' in col or '?' in col:\n",
"                    matcher = re.compile(r'\\b'+col.replace('*', '.*').replace('?', '.')+r'\\b', re.I)\n",
"                    new_cols.extend([c for c in df.columns if matcher.search(c)])\n",
"                else:\n",
"                    new_cols.append(col)\n",
"            cols = new_cols\n",
"        # dict keys dedupe while preserving order\n",
"        return cols if keep_dups else list(dict(zip(cols, cols)))\n",
"\n",
"    # monkeypatch S into DataFrame\n",
"    pd.DataFrame.S = S\n",
"\n",
"    def hugetable(df, soft_max=5000, hard_max=100_000):\n",
"        \"\"\"Temporarily raise pandas display limits to show a large dataframe.\"\"\"\n",
"        max_rows = pd.options.display.max_rows\n",
"        max_columns = pd.options.display.max_columns\n",
"        pd.options.display.max_rows = min(soft_max, hard_max)\n",
"        pd.options.display.max_columns = min(soft_max, hard_max)\n",
"        display_html(df)\n",
"        pd.options.display.max_rows = max_rows\n",
"        pd.options.display.max_columns = max_columns\n",
"except (ImportError, ModuleNotFoundError):\n",
"    pass"
]
},
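{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the column\n",
"# names gvkey/datadate/num_words are hypothetical, just to exercise the pandas\n",
"# helpers defined above on a tiny throwaway dataframe.\n",
"_demo = pd.DataFrame({'gvkey': [1001, 1002],\n",
"                      'datadate': ['2020-12-31', '2021-12-31'],\n",
"                      'num_words': [1234.5678, 9.87]})\n",
"print(fmt_float(1234567.891), fmt_float(0.1234))  # '1,234,568' '0.123'\n",
"print(latex_format(12345), latex_format(0.5))     # '12,345' '0.500'\n",
"print(_demo.S('gv* num*'))                        # ['gvkey', 'num_words']\n",
"hugetable(_demo)                                  # display with raised row/column limits\n",
"del _demo"
]
},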
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # matplotlib for plotting and pyplot for MATLAB-style API\n",
"    import matplotlib as mpl\n",
"    import matplotlib.pyplot as plt\n",
"    plt.rcParams['figure.figsize'] = (15, 5)\n",
"    print(\"MatPlotLib (mpl, plt): {}\".format(mpl.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Seaborn for pretty plotting\n",
"    import seaborn as sns\n",
"    print(\"Seaborn (sns): {}\".format(sns.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Scikit Learn for more regressions\n",
"    import sklearn as sk\n",
"    print(\"Scikit-Learn (sk): {}\".format(sk.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # statsmodels for econometrics\n",
"    import statsmodels.api as sm\n",
"    import statsmodels.formula.api as smf\n",
"    print(\"Statsmodels (sm,smf): {}\".format(sm.__version__))\n",
"except (ImportError, AttributeError):\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # patsy for making formulas\n",
"    import patsy as pt\n",
"    print(\"Patsy (pt): {}\".format(pt.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # SQLAlchemy for relational db management\n",
"    import sqlalchemy as sa\n",
"    print(\"SQLAlchemy (sa): {}\".format(sa.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Gensim for textual analysis\n",
"    import gensim\n",
"    print(\"Gensim: {}\".format(gensim.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # TQDM for progress bar outputs\n",
"    from tqdm.notebook import tqdm\n",
"except ImportError:\n",
"    def tqdm(thing, *args, **kwargs):\n",
"        return thing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # sas7bdat for reading SAS-created data files\n",
"    from sas7bdat import SAS7BDAT as SASdb\n",
"    print(\"SAS7BDAT (SASdb): unknown version\")\n",
"\n",
"    SAS_ZERO = dt.datetime(1960,1,1)\n",
"\n",
"    def sas_date_to_datetime(df_col):\n",
"        \"\"\"Convert a SAS date column (days since 1960-01-01) to pandas datetimes.\"\"\"\n",
"        return pd.to_timedelta(df_col, unit='d') + SAS_ZERO\n",
"except ImportError:\n",
"    pass"
]
},
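{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the file name\n",
"# and 'datadate' column are hypothetical. SASdb exposes to_data_frame(), and\n",
"# sas_date_to_datetime converts SAS day counts (days since 1960-01-01).\n",
"# with SASdb('my_data.sas7bdat') as f:\n",
"#     df_sas = f.to_data_frame()\n",
"# df_sas['datadate'] = sas_date_to_datetime(df_sas['datadate'])"
]
},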
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # BeautifulSoup for HTML things\n",
"    from bs4 import BeautifulSoup\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    from pyedgar.utilities import edgarweb\n",
"except (ImportError, ModuleNotFoundError):\n",
"    class _o_(object):\n",
"        def edgar_links(*args, **kwargs):\n",
"            return ''\n",
"    edgarweb = _o_()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Some nice date constants\n",
"MIN_DATE = dt.datetime(1900, 1, 1)\n",
"MAX_DATE = dt.datetime.today()\n",
"TD_DAY = pd.Timedelta(days=1)\n",
"TD_YEAR = pd.Timedelta(days=1) * 365"
]
},
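{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the constants\n",
"# above compose with pandas datetimes; 'datadate' is a hypothetical column.\n",
"# recent = df[df['datadate'] > MAX_DATE - 3 * TD_YEAR]  # roughly the last 3 years\n",
"print(MIN_DATE, MAX_DATE - TD_YEAR)"
]
},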
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession')\")\n",
"def linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession', return_df=False):\n",
"    \"\"\"\n",
"    Displays top rows of a dataframe, and includes\n",
"    links to the HTML and FTP websites if CIK and Accession are found.\n",
"    \"\"\"\n",
"    if len(df) == 0:\n",
"        display_html(df[fields or df.columns].assign(link='').to_html(), raw=True)\n",
"        return\n",
"\n",
"    w = pd.get_option('display.max_colwidth')\n",
"    pd.set_option('display.max_colwidth', None)\n",
"\n",
"    if fields is None:\n",
"        fields = list(df.columns)\n",
"\n",
"    dfn = df.head(n).copy()\n",
"\n",
"    if cik in dfn.columns:\n",
"        linkstr, i = 'links', 0\n",
"        while linkstr in dfn.columns:\n",
"            linkstr = 'links%d' % i\n",
"            i += 1\n",
"        dfn[linkstr] = dfn.apply(lambda row: edgarweb.edgar_links(row[cik], row[accession]), axis=1)\n",
"        fields.append(linkstr)\n",
"\n",
"    html = f\"<h4>{title}</h4>\" if title else ''\n",
"    html += dfn[fields].to_html(escape=False, index=False, na_rep=\"\")\n",
"\n",
"    display_html(html, raw=True)\n",
"    pd.set_option('display.max_colwidth', w)\n",
"\n",
"    if return_df:\n",
"        return dfn"
]
},
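{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): cik/accession\n",
"# are the column names linkhead expects by default; the values below are made\n",
"# up, so any EDGAR links generated are illustrative only.\n",
"# _filings = pd.DataFrame({'cik': [123456],\n",
"#                          'accession': ['0001234567-20-000001'],\n",
"#                          'form': ['10-K']})\n",
"# linkhead(_filings, n=5, title='Example filings')"
]
},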
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"timehist(dtseries_or_df, time_variable='year', y_tic_number=4, x_tic_skip=0, *args, **kwargs)\")\n",
"def timehist(dtseries_or_df, time_variable='year',\n",
"             y_tic_number=4, x_tic_skip=0,\n",
"             width=.9, ax=None, skip_retick=None,\n",
"             label=None,\n",
"             *args, **kwargs):\n",
"    \"\"\"\n",
"    Histogram of observations per time period.\n",
"    First tries: dtseries_or_df.dt.time_variable\n",
"    Failing that, does dtseries_or_df.value_counts()\n",
"    Sends args and kwargs to figure.\n",
"    \"\"\"\n",
"    if ax is not None and skip_retick is None:\n",
"        skip_retick = True\n",
"    skip_retick = skip_retick or False\n",
"\n",
"    x_tic_skip += 1\n",
"\n",
"    if not skip_retick:\n",
"        sns.set_style('darkgrid')\n",
"        sns.set_context('talk', rc={'patch.linewidth': 0, 'patch.edgecolor': 'k', 'patch.facecolor': 'k'})\n",
"\n",
"    _d = dtseries_or_df\n",
"    try:\n",
"        _d = _d.dt.__getattribute__(time_variable)\n",
"    except Exception:\n",
"        try:\n",
"            _d = _d[time_variable]\n",
"        except Exception:\n",
"            pass\n",
"    _g = _d.value_counts().sort_index()\n",
"    if len(_g) > 1000:\n",
"        logger.error(\"ERROR: You are trying to plot something with too many levels. Don't do that.\")\n",
"        return\n",
"\n",
"    if ax is None:\n",
"        if 'figsize' not in kwargs:\n",
"            kwargs['figsize'] = (13,2)\n",
"        plt.figure(*args, **kwargs)\n",
"        ax = plt.gca()\n",
"        # If ax is None, assume kwargs are for figure generation.\n",
"        kwargs = {}\n",
"\n",
"    ax.bar(_g.index, _g, width=width, label=label, **kwargs)\n",
"\n",
"    if not skip_retick:\n",
"        # Format and label X axis\n",
"        ax.set_xlim(left=_g.index.min()-0.5, right=_g.index.max()+0.5)\n",
"        _t = _g.index[::x_tic_skip]\n",
"        ax.set_xticks(_t)\n",
"        ax.set_xticklabels(map(str, _t), rotation='vertical')\n",
"\n",
"        # Label Y Axis\n",
"        tene = math.log10(_g.max())//1-1\n",
"        topnum = math.ceil(_g.max() / 10**tene)\n",
"        ax.set_yticks([(topnum * i // y_tic_number)*10**tene for i in range(y_tic_number, 0, -1)])\n",
"\n",
"    return ax"
]
},
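{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): a made-up\n",
"# series of random dates, bucketed by year; requires numpy, pandas, matplotlib\n",
"# and seaborn from the cells above.\n",
"# _dates = pd.Series(pd.to_datetime('2015-01-01')\n",
"#                    + pd.to_timedelta(np.random.randint(0, 365 * 8, size=1000), unit='d'))\n",
"# ax = timehist(_dates, time_variable='year', figsize=(13, 2))"
]
},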
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def savefig(file_name, fig, *args, default_root='.', make_name_unique=False, **kwargs):\n",
"    \"\"\"\n",
"    Save figure to disk under default_root.\n",
"\n",
"    If no extension is given, PNG is assumed (at default 300 DPI).\n",
"\n",
"    If make_name_unique flag is True, Y-M-D_H-M-S is added to filename.\n",
"\n",
"    Returns file path that was created.\n",
"    \"\"\"\n",
"    _fname, _ext = os.path.splitext(file_name)\n",
"    if not _ext:\n",
"        _ext = '.png'\n",
"\n",
"    file_path = os.path.join(default_root, _fname + _ext)\n",
"\n",
"    if make_name_unique:\n",
"        file_path = os.path.join(default_root, f\"{_fname}_{dt.datetime.now():%Y-%m-%d_%H-%M-%S}{_ext}\")\n",
"\n",
"    default_kwargs = {\n",
"        'bbox_inches': 'tight',\n",
"        'pad_inches': 0.1,\n",
"        'transparent': True\n",
"    }\n",
"\n",
"    kwargs = {**default_kwargs, **kwargs}\n",
"\n",
"    if _ext == '.png':\n",
"        if 'dpi' not in kwargs:\n",
"            kwargs['dpi'] = 300\n",
"\n",
"    fig.savefig(file_path, *args, **kwargs)\n",
"\n",
"    return file_path"
]
},
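{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): 'fig' and the\n",
"# output name are hypothetical; savefig fills in the .png extension and the\n",
"# 300 DPI default defined above and returns the path it wrote.\n",
"# fig, ax = plt.subplots()\n",
"# ax.plot([1, 2, 3], [2, 1, 3])\n",
"# print(savefig('example_plot', fig, make_name_unique=True))"
]
}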
],
"metadata": {
"kernelspec": {
"display_name": "scrapy",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "214.097px"
},
"toc_section_display": true,
"toc_window_display": true
},
"vscode": {
"interpreter": {
"hash": "503afef024fa0fe44fb4c28bd7916b7ce0d5c75122984c652c96c06c5c01e085"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}