ELC · September 17, 2020 15:47
diff --git a/apt.txt b/apt.txt
 chromium-chromedriver
diff --git a/collect_data.ipynb b/collect_data.ipynb
diff --git a/environment.yml b/environment.yml
 name: EMSS-Castano

 channels:
  - conda-forge

 dependencies:
  - python=3.7
  - numpy
  - pandas
  - scipy
  - seaborn
  - matplotlib
  - selenium
  - tqdm
  - pymc3
  - mkl-service
  - pip:
    - helium
diff --git a/inference.ipynb b/inference.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import pickle\n",
    "\n",
    "import pymc3 as pm\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import probplot, normaltest\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "import arviz as az\n",
    "import theano.tensor as tt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_running = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.style.use('bmh')\n",
    "mpl.rcParams['figure.figsize'] = (16, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"pbs_data.csv\")\n",
    "\n",
    "datetime_full = pd.to_datetime(df['date'])\n",
    "df['year'] = datetime_full.apply(lambda x:x.year)\n",
    "df['month'] = datetime_full.apply(lambda x:x.month)\n",
    "df['day'] = datetime_full.apply(lambda x:x.day)\n",
    "\n",
    "df = df.sort_values(['year', 'month', 'day'])\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = [\"Raw\", \"SQRT\"]\n",
    "index = [\"Poisson with Outlier\", \"Negative Binomial with Outlier\",\n",
    "         \"Poisson without Outlier\", \"Negative Binomial without Outlier\"]\n",
    "template = pd.DataFrame(columns=columns, index=index)\n",
    "mse = template.copy()\n",
    "normal_resid = template.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    with open('mse.p', 'rb') as handle:\n",
    "        mse = pickle.load(handle)\n",
    "except:\n",
    "    print(\"MSE ERROR\")\n",
    "    \n",
    "try:\n",
    "    with open('normal_resid.p', 'rb') as handle:\n",
    "        normal_resid = pickle.load(handle)\n",
    "except:\n",
    "    print(\"RESID ERROR\")\n",
    "    \n",
    "try:\n",
    "    with open('all_data.p', 'rb') as handle:\n",
    "        all_data = pickle.load(handle)\n",
    "except:\n",
    "    print(\"ALL DATA ERROR\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_resid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Views"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interest_variable = df['views_count']\n",
    "n_count_data = len(df)\n",
    "variable = tt.as_tensor_variable(interest_variable)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Poisson with Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_view_poisson_with_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1_mean = variable[:tau].mean()\n",
    "    mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
    "    \n",
    "    mu2_mean = variable[tau:].mean()\n",
    "    mu2 = pm.Exponential(\"mu_2\", 1 / mu2_mean)\n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = pm.math.switch(tau > idx, mu1, mu2)\n",
    "\n",
    "    pm.Poisson(\"x\", switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_view_poisson_with_outliers:\n",
    "    db = pm.backends.Text('poisson_with_outliers')\n",
    "    trace_views_poisson_with_outliers = pm.sample(10000, tune=15000, chains=20, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_poisson_with_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_view_poisson_with_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_poisson_with_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n",
    "mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n",
    "tau_samples = trace_views_poisson_with_outliers['tau']\n",
    "\n",
    "all_data['pwo_mu1'] = mu_1_samples\n",
    "all_data['pwo_mu2'] = mu_2_samples\n",
    "all_data['pwo_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
    "\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_1$\")\n",
    "\n",
    "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper center\", ncol=2)\n",
    "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
    "\n",
    "ax = axs[3]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views_1_sample = np.random.poisson(mu_1_samples)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "views_2_sample = np.random.poisson(mu_2_samples)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)\n",
    "\n",
    "views_1_upper, views_1_lower, views_2_upper, views_2_lower"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
    "         label=\"Expected Number of Views\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"Views\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = interest_variable - expected_views_per_day\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "plt.show()\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['Raw']['Poisson with Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['Raw']['Poisson with Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Negative Binomial with Outlier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_nbinomial_with_outliers:\n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1 = variable[tau:].mean()\n",
    "    sigma1_2 = variable[tau:].var()\n",
    "    alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
    "    \n",
    "    mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
    "    alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
    "    \n",
    "    mu2 = variable[:tau].mean()\n",
    "    sigma2_2 = variable[:tau].var()\n",
    "    alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
    "    \n",
    "    mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
    "    alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2)    \n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = tau > idx\n",
    "    \n",
    "    mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
    "    alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
    "\n",
    "    pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with model_views_nbinomial_with_outliers:\n",
    "    db = pm.backends.Text('negative_binomial_with_outliers')\n",
    "    trace_views_nbinomial_with_outliers = pm.sample(10000, tune=25000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_nbinomial_with_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_views_nbinomial_with_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
    "\n",
    "ax = axs[2]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_nbinomial_with_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n",
    "mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n",
    "alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n",
    "alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n",
    "tau_samples = trace_views_nbinomial_with_outliers['tau']\n",
    "\n",
    "all_data['nbwo_mu1'] = mu_1_samples\n",
    "all_data['nbwo_mu2'] = mu_2_samples\n",
    "all_data['nbwo_alpha1'] = alpha_1_samples\n",
    "all_data['nbwo_alpha2'] = alpha_2_samples\n",
    "all_data['nbwo_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_1$\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[3]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_1$\")\n",
    "\n",
    "ax = axs[4]\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_2$\")\n",
    "\n",
    "ax = axs[5]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[6]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "significance = 0.05\n",
    "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
    "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
    "views_1_upper = np.quantile(views_1_sample, significance / 2)\n",
    "views_1_lower = np.quantile(views_1_sample, 1 - significance / 2)\n",
    "\n",
    "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
    "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
    "views_2_upper = np.quantile(views_2_sample, significance / 2)\n",
    "views_2_lower = np.quantile(views_2_sample, 1 - significance / 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    \n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = interest_variable - expected_views_per_day\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "plt.show()\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['Raw']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['Raw']['Negative Binomial with Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Views"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"pbs_data.csv\")\n",
    "\n",
    "for _ in range(3):\n",
    "    index = df['views_count'].idxmax()\n",
    "    df = df.drop(index)\n",
    "\n",
    "datetime_full = pd.to_datetime(df['date'])\n",
    "df['year'] = datetime_full.apply(lambda x:x.year)\n",
    "df['month'] = datetime_full.apply(lambda x:x.month)\n",
    "df['day'] = datetime_full.apply(lambda x:x.day)\n",
    "\n",
    "df = df.sort_values(['year', 'month', 'day'])\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interest_variable = df['views_count']\n",
    "n_count_data = len(df)\n",
    "variable = tt.as_tensor_variable(interest_variable)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Poisson without Outlier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_poisson_without_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1_mean = variable[:tau].mean()\n",
    "    mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
    "    \n",
    "    mu2_mean = variable[tau:].mean()\n",
    "    mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = pm.math.switch(tau > idx, mu1, mu2)\n",
    "\n",
    "    pm.Poisson(\"x\", switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_views_poisson_without_outliers:\n",
    "    db = pm.backends.Text('poisson_without_outliers')\n",
    "    step = pm.Metropolis()\n",
    "    trace_views_poisson_without_outliers = pm.sample(15000, tune=25000, chains=20, cores=4, step=step, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_poisson_without_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_views_poisson_without_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_poisson_without_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n",
    "mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n",
    "tau_samples = trace_views_poisson_without_outliers['tau']\n",
    "\n",
    "all_data['pwoo_mu1'] = mu_1_samples\n",
    "all_data['pwoo_mu2'] = mu_2_samples\n",
    "all_data['pwoo_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_1$\")\n",
    "\n",
    "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper center\", ncol=2)\n",
    "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
    "\n",
    "ax = axs[3]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views_1_sample = np.random.poisson(mu_1_samples)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "views_2_sample = np.random.poisson(mu_2_samples)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = interest_variable - expected_views_per_day\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['Raw']['Poisson without Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['Raw']['Poisson without Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Negative Binomial without Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_nbinomial_without_outliers:\n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1 = variable[tau:].mean()\n",
    "    sigma1_2 = variable[tau:].var()\n",
    "    alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
    "    \n",
    "    mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
    "    alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
    "    \n",
    "    mu2 = variable[:tau].mean()\n",
    "    sigma2_2 = variable[:tau].var()\n",
    "    alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
    "    \n",
    "    mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
    "    alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2)    \n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = tau > idx\n",
    "    \n",
    "    mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
    "    alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
    "\n",
    "    pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_views_nbinomial_without_outliers:\n",
    "    db = pm.backends.Text('negative_binomial_without_outliers')\n",
    "    trace_views_nbinomial_without_outliers = pm.sample(60000, tune=15000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model:2.2f}s aprox {round((time.perf_counter() - before_model) / 60):2.2f}min\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_nbinomial_without_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_views_nbinomial_without_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
    "\n",
    "ax = axs[2]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_nbinomial_without_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n",
    "mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n",
    "alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n",
    "alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n",
    "tau_samples = trace_views_nbinomial_without_outliers['tau']\n",
    "\n",
    "all_data['nbwoo_mu1'] = mu_1_samples\n",
    "all_data['nbwoo_mu2'] = mu_2_samples\n",
    "all_data['nbwoo_alpha1'] = alpha_1_samples\n",
    "all_data['nbwoo_alpha2'] = alpha_2_samples\n",
    "all_data['nbwoo_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_1$\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[3]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_1$\")\n",
    "\n",
    "ax = axs[4]\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_2$\")\n",
    "\n",
    "ax = axs[5]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[6]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = all_data['nbwoo_mu1']\n",
    "mu_2_samples = all_data['nbwoo_mu2']\n",
    "tau_samples = all_data['nbwoo_tau']\n",
    "alpha_1_samples = all_data['nbwoo_alpha1']\n",
    "alpha_2_samples = all_data['nbwoo_alpha2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(16, 5))\n",
    "\n",
    "values = np.array(tau_samples, dtype=float)\n",
    "bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n",
    "ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior density')\n",
    "plt.xlim(29, 48)\n",
    "plt.xticks(np.arange(29, 49, 1))\n",
    "\n",
    "sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n",
    "\n",
    "ax.set_axisbelow(True)\n",
    "\n",
    "plt.legend(ncol=2)\n",
    "plt.title(\"Posterior Distribution for Tau\")\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Probability\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(16, 5))\n",
    "plt.hist(mu_1_samples, bins=1400, alpha=0.85, label=f\"Posterior Distribution of $\\mu_1$\", density=True)\n",
    "plt.hist(mu_2_samples, bins=800, alpha=0.85, label=f\"Posterior Distribution of $\\mu_2$\", density=True)\n",
    "plt.legend(loc=\"upper right\", ncol=2)\n",
    "plt.xlabel(\"Number of Views (x $100.000$)\")\n",
    "plt.ylabel(\"Probability (x $10^{-5}$)\")\n",
    "plt.title(f\"Posterior Distribution Comparison\")\n",
    "plt.xlim(6e4, 22e4)\n",
    "plt.xticks(np.arange(6e4, 22.5e4, 5e3))\n",
    "ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n",
    "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
    "# plt.yscale('log')\n",
    "\n",
    "ax.set_axisbelow(True)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
    "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
    "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 5))\n",
    "\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    \n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n",
    "         label=\"Expected Views\", ls=\"-.\")\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n",
    "         label=\"95% Probabiliy Density\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 1) + 1)\n",
    "plt.xlim(0.8, len(df) + 0.1)\n",
    "plt.xlabel(\"Video ID\")\n",
    "\n",
    "plt.ylim(0, 3.8e5)\n",
    "plt.ylabel(\"Views (x $10.000$)\")\n",
    "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
    "\n",
    "\n",
    "ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n",
    "ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n",
    "\n",
    "ax.fill_betweenx(y=[0, 3.8e5], x1=33, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n",
    "\n",
    "plt.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n",
    "\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n",
    "handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n",
    "\n",
    "plt.title(\"Posterior Predictive Distribution of Views\")\n",
    "plt.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scale = 1.2\n",
    "plt.figure(figsize=(14.28 * scale, 6 * scale))\n",
    "\n",
    "ax = plt.subplot(212)\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n",
    "         label=\"Expected Views\", ls=\"-.\")\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n",
    "\n",
    "ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n",
    "         label=\"95% Probabiliy Density\")\n",
    "\n",
    "ax.set_xticks(np.arange(0, len(df), 1) + 1)\n",
    "ax.set_xlim(0.8, len(df) + 0.1)\n",
    "ax.set_xlabel(\"Video ID\")\n",
    "\n",
    "ax.set_ylim(0, 3.8e5)\n",
    "ax.set_ylabel(\"Views (x $100.000$)\")\n",
    "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
    "\n",
    "ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n",
    "ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n",
    "\n",
    "ax.fill_betweenx(y=[0, 3.8e5], x1=35, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n",
    "\n",
    "ax.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n",
    "\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n",
    "handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n",
    "\n",
    "ax.set_title(\"Posterior Predictive Distribution of Views\")\n",
    "ax.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2, prop={'size': 11})\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "ax = plt.subplot(221)\n",
    "\n",
    "ax.hist(mu_1_samples, bins=1400, label=f\"$\\mu_1$ Posterior Density\", density=True)\n",
    "ax.hist(mu_2_samples, bins=800, label=f\"$\\mu_2$ Posterior Density\", density=True)\n",
    "\n",
    "ax.set_xlim(6e4, 22e4)\n",
    "ax.set_xticks(np.arange(6e4, 22.5e4, 1e4))\n",
    "ax.set_xlabel(\"Number of Views (x $100.000$)\")\n",
    "ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n",
    "\n",
    "ax.set_ylim(0, 5.4e-5)\n",
    "ax.set_yticks(np.arange(0, 5.5e-5, 5e-6))\n",
    "ax.set_ylabel(\"Probability (x $10^{-5}$)\")\n",
    "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
    "\n",
    "ax.set_axisbelow(True)\n",
    "ax.set_title(f\"Likelihood Parameters Posteriors\")\n",
    "ax.legend(loc=\"upper right\", prop={'size': 11})\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "ax = plt.subplot(222)\n",
    "\n",
    "values = np.array(tau_samples, dtype=float)\n",
    "bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n",
    "ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior Density')\n",
    "sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n",
    "\n",
    "ax.set_xlim(29, 48)\n",
    "ax.set_xticks(np.arange(29, 49, 1))\n",
    "ax.set_xlabel(\"Video ID\")\n",
    "\n",
    "ax.set_ylim(0, 0.195)\n",
    "ax.set_yticks(np.arange(0, 0.2, 0.02))\n",
    "ax.set_ylabel(\"Probability (x $10^{-1}$)\")\n",
    "ax.ticklabel_format(axis='y', style='sci', scilimits=(0,0))\n",
    "\n",
    "ax.set_axisbelow(True)\n",
    "ax.set_title(\"Change Point Posterior\")\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "ax.legend(handles[::-1], labels[::-1], loc=\"upper right\", prop={'size': 11})\n",
    "\n",
    "plt.suptitle('Posterior and Posterior Predictive Distributions', fontsize=16)\n",
    "plt.tight_layout()\n",
    "plt.subplots_adjust(top=0.91)\n",
    "plt.savefig(\"posterior.pdf\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "resid = interest_variable - expected_views_per_day\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['Raw']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['Raw']['Negative Binomial without Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_resid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SQRT Views"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"pbs_data.csv\")\n",
    "\n",
    "datetime_full = pd.to_datetime(df['date'])\n",
    "df['year'] = datetime_full.apply(lambda x:x.year)\n",
    "df['month'] = datetime_full.apply(lambda x:x.month)\n",
    "df['day'] = datetime_full.apply(lambda x:x.day)\n",
    "\n",
    "df = df.sort_values(['year', 'month', 'day'])\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interest_variable = np.sqrt(df['views_count'])\n",
    "n_count_data = len(df)\n",
    "variable = tt.as_tensor_variable(interest_variable)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Poisson with Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_view_poisson_with_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1_mean = variable[:tau].mean()\n",
    "    mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
    "    \n",
    "    mu2_mean = variable[tau:].mean()\n",
    "    mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = pm.math.switch(tau > idx, mu1, mu2)\n",
    "\n",
    "    pm.Poisson(\"x\", switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_view_poisson_with_outliers:\n",
    "    db = pm.backends.Text('poisson_sqrt_with_outliers')\n",
    "    trace_views_poisson_with_outliers = pm.sample(draws=200000, tune=500000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duration = time.perf_counter() - before_model\n",
    "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_poisson_with_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_view_poisson_with_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_poisson_with_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n",
    "mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n",
    "tau_samples = trace_views_poisson_with_outliers['tau']\n",
    "\n",
    "all_data['pwo_sqrt_mu1'] = mu_1_samples\n",
    "all_data['pwo_sqrt_mu2'] = mu_2_samples\n",
    "all_data['pwo_sqrt_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
    "\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_1$\")\n",
    "\n",
    "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper center\", ncol=2)\n",
    "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
    "\n",
    "ax = axs[3]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views_1_sample = np.random.poisson(mu_1_samples)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "views_2_sample = np.random.poisson(mu_2_samples)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)\n",
    "\n",
    "views_1_upper, views_1_lower, views_2_upper, views_2_lower"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "plt.show()\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['SQRT']['Poisson with Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['SQRT']['Poisson with Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Negative Binomial with Outlier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_nbinomial_with_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1 = variable[tau:].mean()\n",
    "    sigma1_2 = variable[tau:].var()\n",
    "    alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
    "    \n",
    "    mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
    "    alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
    "    \n",
    "    mu2 = variable[:tau].mean()\n",
    "    sigma2_2 = variable[:tau].var()\n",
    "    alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
    "    \n",
    "    mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
    "    alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2)    \n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = tau > idx\n",
    "    \n",
    "    mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
    "    alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
    "\n",
    "    pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_views_nbinomial_with_outliers:\n",
    "    db = pm.backends.Text('negative_binomial_sqrt_with_outliers')\n",
    "    trace_views_nbinomial_with_outliers = pm.sample(tune=80000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duration = time.perf_counter() - before_model\n",
    "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_nbinomial_with_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_views_nbinomial_with_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
    "\n",
    "ax = axs[2]\n",
    "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_nbinomial_with_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n",
    "mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n",
    "alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n",
    "alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n",
    "tau_samples = trace_views_nbinomial_with_outliers['tau']\n",
    "\n",
    "all_data['nbwo_sqrt_mu1'] = mu_1_samples\n",
    "all_data['nbwo_sqrt_mu2'] = mu_2_samples\n",
    "all_data['nbwo_sqrt_alpha1'] = alpha_1_samples\n",
    "all_data['nbwo_sqrt_alpha2'] = alpha_2_samples\n",
    "all_data['nbwo_sqrt_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_1$\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[3]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_1$\")\n",
    "\n",
    "ax = axs[4]\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_2$\")\n",
    "\n",
    "ax = axs[5]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[6]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
    "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
    "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    \n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "plt.show()\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['SQRT']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['SQRT']['Negative Binomial with Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Views"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"pbs_data.csv\")\n",
    "\n",
    "for _ in range(3):\n",
    "    index = df['views_count'].idxmax()\n",
    "    df = df.drop(index)\n",
    "\n",
    "datetime_full = pd.to_datetime(df['date'])\n",
    "df['year'] = datetime_full.apply(lambda x:x.year)\n",
    "df['month'] = datetime_full.apply(lambda x:x.month)\n",
    "df['day'] = datetime_full.apply(lambda x:x.day)\n",
    "\n",
    "df = df.sort_values(['year', 'month', 'day'])\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interest_variable = np.sqrt(df['views_count'])\n",
    "n_count_data = len(df)\n",
    "variable = tt.as_tensor_variable(interest_variable)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Poisson without Outlier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_poisson_without_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1_mean = variable[:tau].mean()\n",
    "    mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
    "    \n",
    "    mu2_mean = variable[tau:].mean()\n",
    "    mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = pm.math.switch(tau > idx, mu1, mu2)\n",
    "\n",
    "    pm.Poisson(\"x\", switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_views_poisson_without_outliers:\n",
    "    db = pm.backends.Text('poisson_sqrt_without_outliers')\n",
    "    trace_views_poisson_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duration = time.perf_counter() - before_model\n",
    "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_poisson_without_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.model_to_graphviz(model_views_poisson_without_outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_poisson_without_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n",
    "mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n",
    "tau_samples = trace_views_poisson_without_outliers['tau']\n",
    "\n",
    "all_data['pwoo_sqrt_mu1'] = mu_1_samples\n",
    "all_data['pwoo_sqrt_mu2'] = mu_2_samples\n",
    "all_data['pwoo_sqrt_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_1$\")\n",
    "\n",
    "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$\\mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
    "ax.legend(loc=\"upper center\", ncol=2)\n",
    "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
    "\n",
    "ax = axs[3]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views_1_sample = np.random.poisson(mu_1_samples)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "views_2_sample = np.random.poisson(mu_2_samples)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['SQRT']['Poisson without Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['SQRT']['Poisson without Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Negative Binomial without Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "before_model = time.perf_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pm.Model() as model_views_nbinomial_without_outliers:\n",
    "    \n",
    "    tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
    "    \n",
    "    mu1 = variable[tau:].mean()\n",
    "    sigma1_2 = variable[tau:].var()\n",
    "    alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
    "    \n",
    "    mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
    "    alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
    "    \n",
    "    mu2 = variable[:tau].mean()\n",
    "    sigma2_2 = variable[:tau].var()\n",
    "    alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
    "    \n",
    "    mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
    "    alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2)    \n",
    "    \n",
    "    idx = np.arange(len(df))\n",
    "    \n",
    "    switch = tau > idx\n",
    "    \n",
    "    mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
    "    alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
    "\n",
    "    pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with model_views_nbinomial_without_outliers:\n",
    "    db = pm.backends.Text('negative_binomial_sqrt_without_outliers')\n",
    "    trace_views_nbinomial_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duration = time.perf_counter() - before_model\n",
    "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pm.summary(trace_views_nbinomial_without_outliers).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
    "\n",
    "ax = axs[0]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n",
    "\n",
    "ax = axs[1]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
    "\n",
    "ax = axs[2]\n",
    "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = az.from_pymc3(\n",
    "    trace=trace_views_nbinomial_without_outliers,\n",
    ")\n",
    "az.plot_trace(data);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n",
    "mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n",
    "alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n",
    "alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n",
    "tau_samples = trace_views_nbinomial_without_outliers['tau']\n",
    "\n",
    "all_data['nbwoo_sqrt_mu1'] = mu_1_samples\n",
    "all_data['nbwoo_sqrt_mu2'] = mu_2_samples\n",
    "all_data['nbwoo_sqrt_alpha1'] = alpha_1_samples\n",
    "all_data['nbwoo_sqrt_alpha2'] = alpha_2_samples\n",
    "all_data['nbwoo_sqrt_tau'] = tau_samples\n",
    "\n",
    "with open('all_data.p', 'wb') as handle:\n",
    "    pickle.dump(all_data, handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
    "\n",
    "ax = axs[0]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_1$\")\n",
    "\n",
    "ax = axs[1]\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$mu_2$\")\n",
    "\n",
    "ax = axs[2]\n",
    "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
    "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[3]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_1$\")\n",
    "\n",
    "ax = axs[4]\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(\"$alpha_2$\")\n",
    "\n",
    "ax = axs[5]\n",
    "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
    "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
    "ax.legend(loc=\"upper left\")\n",
    "\n",
    "ax = axs[6]\n",
    "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
    "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
    "         label=r\"posterior of $\\tau$\",\n",
    "         color=\"#467821\", weights=w)\n",
    "ax.legend(loc=\"upper left\")\n",
    "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
    "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
    "views_1_upper = np.quantile(views_1_sample, 0.025)\n",
    "views_1_lower = np.quantile(views_1_sample, 0.975)\n",
    "\n",
    "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
    "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
    "views_2_upper = np.quantile(views_2_sample, 0.025)\n",
    "views_2_lower = np.quantile(views_2_sample, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    \n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Residuals Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "plt.scatter(np.arange(len(df)), resid)\n",
    "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
    "print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 5))\n",
    "x, y = probplot(resid, plot=plt);\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "_, p = normaltest(resid)\n",
    "print(f\"P-Value for Normal Test: {p:.4g}\")\n",
    "if p > 1e-3:\n",
    "    print(\"Suggest that the residuals are normaly distributed\")\n",
    "else:\n",
    "    print(\"Suggest that the residuals are NOT normaly distributed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse['SQRT']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n",
    "normal_resid['SQRT']['Negative Binomial without Outlier'] = p\n",
    "\n",
    "with open('mse.p', 'wb') as handle:\n",
    "    pickle.dump(mse, handle)\n",
    "\n",
    "with open('normal_resid.p', 'wb') as handle:\n",
    "    pickle.dump(normal_resid, handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Showing all Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse_scaled = mse ** 0.5\n",
    "mse_scaled = mse_scaled.drop('LOG', axis=1)\n",
    "mse_scaled /= mse_scaled.min().min()\n",
    "mse_scaled = mse_scaled.astype(float).round(2)\n",
    "mse_scaled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_resid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_resid_scale = normal_resid.copy()\n",
    "normal_resid_scale = normal_resid_scale.drop('LOG', axis=1)\n",
    "normal_resid_scale *= 100\n",
    "normal_resid_scale > 0.5\n",
    "normal_resid_scale"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "elapsed_time = time.perf_counter() - start_running\n",
    "print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n",
    "print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n",
    "print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = tau_samples.shape[0]\n",
    "expected_views_per_day = np.zeros(n_count_data)\n",
    "expected_views_upper = np.zeros(n_count_data)\n",
    "expected_views_lower = np.zeros(n_count_data)\n",
    "\n",
    "plt.figure(figsize=(16, 5))\n",
    "\n",
    "for day in range(0, n_count_data):\n",
    "    ix = day < tau_samples\n",
    "    \n",
    "    expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
    "    expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
    "    expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
    "         label=\"expected number of text-messages received\")\n",
    "\n",
    "plt.xticks(np.arange(0, len(df), 2))\n",
    "plt.xlim(-1, len(df))\n",
    "plt.xlabel(\"Video ID\")\n",
    "plt.ylabel(\"Views\")\n",
    "plt.title(\"Expected number of views per video\")\n",
    "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
    "\n",
    "plt.legend(loc=\"upper left\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Execution Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "elapsed_time = time.perf_counter() - start_running\n",
    "print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n",
    "print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n",
    "print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
diff --git a/paper_figures.ipynb b/paper_figures.ipynb
diff --git a/pbs_data.csv b/pbs_data.csv
	name: EMSS-Castano

	channels:
	- conda-forge

	dependencies:
	- python=3.7
	- numpy
	- pandas
	- scipy
	- seaborn
	- matplotlib
	- selenium
	- tqdm
	- pymc3
	- mkl-service
	- pip:
	- helium
views_count	date	likes	dislikes	comments_count
118579	17 May 2018	4243	675	1290
102081	17 May 2018	1731	143	375
54220	26 Apr 2018	1415	65	188
60052	13 Apr 2018	1490	37	141
37976	29 Mar 2018	1569	42	167
98477	23 Mar 2018	2471	69	784
43906	15 Mar 2018	1080	51	208
82826	8 Mar 2018	2412	105	398
77348	1 Mar 2018	2257	26	377
82363	27 Feb 2018	3229	49	608
77653	15 Feb 2018	2008	46	278
133253	1 Feb 2018	3354	186	376
140062	25 Jan 2018	4496	90	639
69011	18 Jan 2018	1869	344	315
60526	11 Jan 2018	2126	34	187
67910	21 Dec 2017	1974	267	359
63290	14 Dec 2017	2872	34	249
84684	12 Dec 2017	2798	49	330
70405	30 Nov 2017	2721	120	404
104901	23 Nov 2017	4251	201	613
139189	17 Nov 2017	6433	95	954
114225	10 Nov 2017	3171	86	262
703089	19 Oct 2017	17149	991	2374
144368	12 Oct 2017	4645	75	547
69256	6 Oct 2017	1845	23	207
101029	28 Sep 2017	2907	58	319
127403	21 Sep 2017	3748	80	540
139737	14 Sep 2017	3635	81	684
226200	24 Aug 2017	6287	130	569
102046	23 Aug 2017	2929	44	275
137518	22 Aug 2017	3799	43	259
88791	10 Aug 2017	2985	190	659
265561	3 Aug 2017	8735	197	768
346312	21 Jul 2017	9094	644	1036
114411	13 Jul 2017	2824	63	392
83106	29 Jun 2017	2149	45	351
122523	22 Jun 2017	2359	109	607
106887	15 Jun 2017	3910	45	601
92765	8 Jun 2017	3078	90	420
98660	1 Jun 2017	4007	46	402
222122	19 May 2017	5939	188	889
126756	11 May 2017	3710	60	487
103940	4 May 2017	2810	68	413
198880	27 Apr 2017	4721	83	418
210136	20 Apr 2017	5472	104	480
193191	13 Apr 2017	4576	158	919
23375	10 Apr 2017	1415	62	275
89182	6 Apr 2017	2375	53	386
174447	23 Mar 2017	4461	98	488
91416	16 Mar 2017	2561	53	377
168412	9 Mar 2017	5278	110	662
312267	2 Mar 2017	6188	419	1156
72497	23 Feb 2017	2103	57	284
566168	16 Feb 2017	14809	336	1377
158059	2 Feb 2017	3683	53	546
241847	26 Jan 2017	6259	185	997
313571	19 Jan 2017	7921	140	980
153137	12 Jan 2017	4617	59	606
274378	5 Jan 2017	8077	404	1304
143995	22 Dec 2016	3565	75	403
169567	15 Dec 2016	4033	70	968
212652	8 Dec 2016	5400	145	1134
111195	1 Dec 2016	3529	46	548
318748	24 Nov 2016	8585	352	1826
872483	17 Nov 2016	23675	745	2391