Header file for 90% of the Notebook necessities.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# system/os/regex and basic math functions\n",
"import os\n",
"import re\n",
"import sys\n",
"import math\n",
"import json\n",
"import time\n",
"import string\n",
"import dateutil\n",
"import datetime as dt\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set logging level\n",
"import logging\n",
"try:\n",
"    kwargs = {'level':getattr(logging, LOG_LEVEL)}\n",
"except NameError:\n",
"    kwargs = {'level':logging.WARNING}\n",
"    print('Set LOG_LEVEL=\"INFO\" before running the import file to get moar output.')\n",
"try:\n",
"    kwargs['format'] = LOG_FORMAT\n",
"except NameError:\n",
"    kwargs['format'] = \"%(levelname)s::%(message)s\"\n",
"    print('Set LOG_FORMAT to change log format.')\n",
"\n",
"logging.basicConfig(**kwargs)\n",
"logger = logging.getLogger('notebook')\n",
"del kwargs\n",
"\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)"
]
},
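{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): LOG_LEVEL and\n",
"# LOG_FORMAT are plain module-level names the cell above looks for, so define\n",
"# them *before* running this header notebook (e.g. via %run) to change logging:\n",
"#     LOG_LEVEL = 'INFO'\n",
"#     LOG_FORMAT = '%(asctime)s %(levelname)s::%(message)s'\n",
"logger.info('Logging configured at %s', logging.getLevelName(logger.getEffectiveLevel()))"
]
},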
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# IPython display convenience stuff\n",
"try:\n",
"    from IPython.display import HTML, display, display_html, display_javascript\n",
"    from IPython import __version__ as ipythonversion\n",
"    import ipywidgets\n",
"    print(\"IPython: {}\".format(ipythonversion))\n",
"except ImportError:\n",
"    pass"
]
},
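{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): display and\n",
"# HTML are the standard IPython display API imported above; guard on the import\n",
"# having succeeded so the header still runs outside IPython.\n",
"if 'HTML' in globals():\n",
"    display(HTML('<b>Header loaded</b>'))"
]
},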
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # numpy for matrix algebra\n",
"    import numpy as np\n",
"    os.environ['NUMEXPR_MAX_THREADS'] = '20'\n",
"    print(\"Numpy (np): {}\".format(np.version.full_version))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # scipy for probability distributions and some statistical tests\n",
"    import scipy as sp\n",
"    import scipy.stats as stats\n",
"    print(\"Scipy (sp, stats): {}\".format(sp.version.full_version))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # pandas for data manipulation\n",
"    import pandas as pd\n",
"    print(\"Pandas (pd): {}\".format(pd.__version__))\n",
"\n",
"    def fmt_float(float_in, rstrip0s=re.compile(r'\\.0+$')):\n",
"        \"\"\"Comma-format floats: 3 decimals below 1,000, none at or above (trailing .000 stripped).\"\"\"\n",
"        try:\n",
"            return rstrip0s.sub('', '{0:,.{1}f}'.format(float_in, 3 - 3 * bool(abs(float_in) // 1000)))\n",
"        except Exception:\n",
"            return str(float_in)\n",
"    pd.set_option('float_format', fmt_float)\n",
"    pd.set_option('display.max_rows', 250)\n",
"    pd.set_option('display.max_columns', 250)\n",
"    pd.set_option('display.notebook_repr_html', True)\n",
"\n",
"    def latex_format(num_in):\n",
"        \"\"\"Format numbers for LaTeX tables\"\"\"\n",
"        try:\n",
"            num_in = float(num_in)\n",
"            if num_in == 0:\n",
"                return \"0\"\n",
"            num_dig = np.log10(abs(num_in)) + 1\n",
"            if num_dig >= 3:\n",
"                return f\"{int(num_in):,d}\"\n",
"            elif num_dig >= 1:\n",
"                return f\"{num_in:2.1f}\"\n",
"            return f\"{num_in:1.3f}\"\n",
"        except ValueError:\n",
"            return str(num_in)\n",
"\n",
"    def S(df, cols=None, keep_dups=False):\n",
"        \"\"\"S splits strings, and if called with a df input, interpolates variable names.\n",
"\n",
"        Example::\n",
"            S('gvkey datadate')       # --> ['gvkey', 'datadate']\n",
"            df.S('gvk* datad* num*')  # --> ['gvkey', 'datadate', 'num_words', 'num_sentences']\n",
"        \"\"\"\n",
"        if isinstance(df, str):\n",
"            cols = df\n",
"        if isinstance(cols, str):\n",
"            new_cols = []\n",
"            for col in cols.split():\n",
"                if '*' in col or '?' in col:\n",
"                    matcher = re.compile(r'\\b'+col.replace('*', '.*').replace('?', '.')+r'\\b', re.I)\n",
"                    new_cols.extend([c for c in df.columns if matcher.search(c)])\n",
"                else:\n",
"                    new_cols.append(col)\n",
"            cols = new_cols\n",
"        # dict keys dedupe while preserving order\n",
"        return cols if keep_dups else list(dict(zip(cols, cols)))\n",
"\n",
"    # monkeypatch S into DataFrame\n",
"    pd.DataFrame.S = S\n",
"\n",
"    def hugetable(df, soft_max=5000, hard_max=100_000):\n",
"        \"\"\"Temporarily raise pandas display limits to show a large dataframe.\"\"\"\n",
"        max_rows = pd.options.display.max_rows\n",
"        max_columns = pd.options.display.max_columns\n",
"        pd.options.display.max_rows = min(soft_max, hard_max)\n",
"        pd.options.display.max_columns = min(soft_max, hard_max)\n",
"        display_html(df)\n",
"        pd.options.display.max_rows = max_rows\n",
"        pd.options.display.max_columns = max_columns\n",
"except (ImportError, ModuleNotFoundError):\n",
"    pass"
]
},
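{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the column\n",
"# names gvkey/datadate/num_words are hypothetical, just to exercise the pandas\n",
"# helpers defined above on a tiny throwaway dataframe.\n",
"_demo = pd.DataFrame({'gvkey': [1001, 1002],\n",
"                      'datadate': ['2020-12-31', '2021-12-31'],\n",
"                      'num_words': [1234.5678, 9.87]})\n",
"print(fmt_float(1234567.891), fmt_float(0.1234))  # '1,234,568' '0.123'\n",
"print(latex_format(12345), latex_format(0.5))     # '12,345' '0.500'\n",
"print(_demo.S('gv* num*'))                        # ['gvkey', 'num_words']\n",
"hugetable(_demo)                                  # display with raised row/column limits\n",
"del _demo"
]
},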
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # matplotlib for plotting and pyplot for MATLAB-style API\n",
"    import matplotlib as mpl\n",
"    import matplotlib.pyplot as plt\n",
"    plt.rcParams['figure.figsize'] = (15, 5)\n",
"    print(\"MatPlotLib (mpl, plt): {}\".format(mpl.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Seaborn for pretty plotting\n",
"    import seaborn as sns\n",
"    print(\"Seaborn (sns): {}\".format(sns.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Scikit Learn for more regressions\n",
"    import sklearn as sk\n",
"    print(\"Scikit-Learn (sk): {}\".format(sk.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # statsmodels for econometrics\n",
"    import statsmodels.api as sm\n",
"    import statsmodels.formula.api as smf\n",
"    print(\"Statsmodels (sm,smf): {}\".format(sm.__version__))\n",
"except (ImportError, AttributeError):\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # patsy for making formulas\n",
"    import patsy as pt\n",
"    print(\"Patsy (pt): {}\".format(pt.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # SQLAlchemy for relational db management\n",
"    import sqlalchemy as sa\n",
"    print(\"SQLAlchemy (sa): {}\".format(sa.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Gensim for textual analysis\n",
"    import gensim\n",
"    print(\"Gensim: {}\".format(gensim.__version__))\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # TQDM for progress bar outputs\n",
"    from tqdm.notebook import tqdm\n",
"except ImportError:\n",
"    def tqdm(thing, *args, **kwargs):\n",
"        return thing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # sas7bdat for reading SAS-created data files\n",
"    from sas7bdat import SAS7BDAT as SASdb\n",
"    print(\"SAS7BDAT (SASdb): unknown version\")\n",
"\n",
"    SAS_ZERO = dt.datetime(1960,1,1)\n",
"\n",
"    def sas_date_to_datetime(df_col):\n",
"        \"\"\"Convert a SAS date column (days since 1960-01-01) to pandas datetimes.\"\"\"\n",
"        return pd.to_timedelta(df_col, unit='d') + SAS_ZERO\n",
"except ImportError:\n",
"    pass"
]
},
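{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the file name\n",
"# and 'datadate' column are hypothetical. SASdb exposes to_data_frame(), and\n",
"# sas_date_to_datetime converts SAS day counts (days since 1960-01-01).\n",
"# with SASdb('my_data.sas7bdat') as f:\n",
"#     df_sas = f.to_data_frame()\n",
"# df_sas['datadate'] = sas_date_to_datetime(df_sas['datadate'])"
]
},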
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # BeautifulSoup for HTML things\n",
"    from bs4 import BeautifulSoup\n",
"except ImportError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    from pyedgar.utilities import edgarweb\n",
"except (ImportError, ModuleNotFoundError):\n",
"    class _o_(object):\n",
"        def edgar_links(*args, **kwargs):\n",
"            return ''\n",
"    edgarweb = _o_()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Some nice date constants\n",
"MIN_DATE = dt.datetime(1900, 1, 1)\n",
"MAX_DATE = dt.datetime.today()\n",
"TD_DAY = pd.Timedelta(days=1)\n",
"TD_YEAR = pd.Timedelta(days=1) * 365"
]
},
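{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): the constants\n",
"# above compose with pandas datetimes; 'datadate' is a hypothetical column.\n",
"# recent = df[df['datadate'] > MAX_DATE - 3 * TD_YEAR]  # roughly the last 3 years\n",
"print(MIN_DATE, MAX_DATE - TD_YEAR)"
]
},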
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession')\")\n",
"def linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession', return_df=False):\n",
"    \"\"\"\n",
"    Displays top rows of a dataframe, and includes\n",
"    links to the HTML and FTP websites if CIK and Accession are found.\n",
"    \"\"\"\n",
"    if len(df) == 0:\n",
"        display_html(df[fields or df.columns].assign(link='').to_html(), raw=True)\n",
"        return\n",
"\n",
"    w = pd.get_option('display.max_colwidth')\n",
"    pd.set_option('display.max_colwidth', None)\n",
"\n",
"    if fields is None:\n",
"        fields = list(df.columns)\n",
"\n",
"    dfn = df.head(n).copy()\n",
"\n",
"    if cik in dfn.columns:\n",
"        linkstr, i = 'links', 0\n",
"        while linkstr in dfn.columns:\n",
"            linkstr = 'links%d' % i\n",
"            i += 1\n",
"        dfn[linkstr] = dfn.apply(lambda row: edgarweb.edgar_links(row[cik], row[accession]), axis=1)\n",
"        fields.append(linkstr)\n",
"\n",
"    html = f\"<h4>{title}</h4>\" if title else ''\n",
"    html += dfn[fields].to_html(escape=False, index=False, na_rep=\"\")\n",
"\n",
"    display_html(html, raw=True)\n",
"    pd.set_option('display.max_colwidth', w)\n",
"\n",
"    if return_df:\n",
"        return dfn"
]
},
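{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): cik/accession\n",
"# are the column names linkhead expects by default; the values below are made\n",
"# up, so any EDGAR links generated are illustrative only.\n",
"# _filings = pd.DataFrame({'cik': [123456],\n",
"#                          'accession': ['0001234567-20-000001'],\n",
"#                          'form': ['10-K']})\n",
"# linkhead(_filings, n=5, title='Example filings')"
]
},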
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"timehist(dtseries_or_df, time_variable='year', y_tic_number=4, x_tic_skip=0, *args, **kwargs)\")\n",
"def timehist(dtseries_or_df, time_variable='year',\n",
"             y_tic_number=4, x_tic_skip=0,\n",
"             width=.9, ax=None, skip_retick=None,\n",
"             label=None,\n",
"             *args, **kwargs):\n",
"    \"\"\"\n",
"    Histogram of observations per time period.\n",
"    First tries: dtseries_or_df.dt.time_variable\n",
"    Failing that, does dtseries_or_df.value_counts()\n",
"    Sends args and kwargs to figure.\n",
"    \"\"\"\n",
"    if ax is not None and skip_retick is None:\n",
"        skip_retick = True\n",
"    skip_retick = skip_retick or False\n",
"\n",
"    x_tic_skip += 1\n",
"\n",
"    if not skip_retick:\n",
"        sns.set_style('darkgrid')\n",
"        sns.set_context('talk', rc={'patch.linewidth': 0, 'patch.edgecolor': 'k', 'patch.facecolor': 'k'})\n",
"\n",
"    _d = dtseries_or_df\n",
"    try:\n",
"        _d = _d.dt.__getattribute__(time_variable)\n",
"    except Exception:\n",
"        try:\n",
"            _d = _d[time_variable]\n",
"        except Exception:\n",
"            pass\n",
"    _g = _d.value_counts().sort_index()\n",
"    if len(_g) > 1000:\n",
"        logger.error(\"ERROR: You are trying to plot something with too many levels. Don't do that.\")\n",
"        return\n",
"\n",
"    if ax is None:\n",
"        if 'figsize' not in kwargs:\n",
"            kwargs['figsize'] = (13,2)\n",
"        plt.figure(*args, **kwargs)\n",
"        ax = plt.gca()\n",
"        # If ax is None, assume kwargs are for figure generation.\n",
"        kwargs = {}\n",
"\n",
"    ax.bar(_g.index, _g, width=width, label=label, **kwargs)\n",
"\n",
"    if not skip_retick:\n",
"        # Format and label X axis\n",
"        ax.set_xlim(left=_g.index.min()-0.5, right=_g.index.max()+0.5)\n",
"        _t = _g.index[::x_tic_skip]\n",
"        ax.set_xticks(_t)\n",
"        ax.set_xticklabels(map(str, _t), rotation='vertical')\n",
"\n",
"        # Label Y Axis\n",
"        tene = math.log10(_g.max())//1-1\n",
"        topnum = math.ceil(_g.max() / 10**tene)\n",
"        ax.set_yticks([(topnum * i // y_tic_number)*10**tene for i in range(y_tic_number, 0, -1)])\n",
"\n",
"    return ax"
]
},
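{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): a made-up\n",
"# series of random dates, bucketed by year; requires numpy, pandas, matplotlib\n",
"# and seaborn from the cells above.\n",
"# _dates = pd.Series(pd.to_datetime('2015-01-01')\n",
"#                    + pd.to_timedelta(np.random.randint(0, 365 * 8, size=1000), unit='d'))\n",
"# ax = timehist(_dates, time_variable='year', figsize=(13, 2))"
]
},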
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def savefig(file_name, fig, *args, default_root='.', make_name_unique=False, **kwargs):\n",
"    \"\"\"\n",
"    Save figure to disk under default_root.\n",
"\n",
"    If no extension is given, PNG is assumed (at default 300 DPI).\n",
"\n",
"    If make_name_unique flag is True, Y-M-D_H-M-S is added to filename.\n",
"\n",
"    Returns file path that was created.\n",
"    \"\"\"\n",
"    _fname, _ext = os.path.splitext(file_name)\n",
"    if not _ext:\n",
"        _ext = '.png'\n",
"\n",
"    file_path = os.path.join(default_root, _fname + _ext)\n",
"\n",
"    if make_name_unique:\n",
"        file_path = os.path.join(default_root, f\"{_fname}_{dt.datetime.now():%Y-%m-%d_%H-%M-%S}{_ext}\")\n",
"\n",
"    default_kwargs = {\n",
"        'bbox_inches': 'tight',\n",
"        'pad_inches': 0.1,\n",
"        'transparent': True\n",
"    }\n",
"\n",
"    kwargs = {**default_kwargs, **kwargs}\n",
"\n",
"    if _ext == '.png':\n",
"        if 'dpi' not in kwargs:\n",
"            kwargs['dpi'] = 300\n",
"\n",
"    fig.savefig(file_path, *args, **kwargs)\n",
"\n",
"    return file_path"
]
},
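{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added example, not part of the original header): 'fig' and the\n",
"# output name are hypothetical; savefig fills in the .png extension and the\n",
"# 300 DPI default defined above and returns the path it wrote.\n",
"# fig, ax = plt.subplots()\n",
"# ax.plot([1, 2, 3], [2, 1, 3])\n",
"# print(savefig('example_plot', fig, make_name_unique=True))"
]
}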
],
"metadata": {
"kernelspec": {
"display_name": "scrapy",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "214.097px"
},
"toc_section_display": true,
"toc_window_display": true
},
"vscode": {
"interpreter": {
"hash": "503afef024fa0fe44fb4c28bd7916b7ce0d5c75122984c652c96c06c5c01e085"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}