Last active
September 17, 2020 15:47
-
-
Save ELC/6a0f1716bc39d0ef21285722f0c57bd3 to your computer and use it in GitHub Desktop.
EMSS2020
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| chromium-chromedriver |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: EMSS-Castano | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - python=3.7 | |
| - numpy | |
| - pandas | |
| - scipy | |
| - seaborn | |
| - matplotlib | |
| - selenium | |
| - tqdm | |
| - pymc3 | |
| - mkl-service | |
| - pip: | |
| - helium |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import time\n", | |
| "import pickle\n", | |
| "\n", | |
| "import pymc3 as pm\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from scipy.stats import probplot, normaltest\n", | |
| "\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import matplotlib as mpl\n", | |
| "import arviz as az\n", | |
| "import theano.tensor as tt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "start_running = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.style.use('bmh')\n", | |
| "mpl.rcParams['figure.figsize'] = (16, 10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_csv(\"pbs_data.csv\")\n", | |
| "\n", | |
| "datetime_full = pd.to_datetime(df['date'])\n", | |
| "df['year'] = datetime_full.apply(lambda x:x.year)\n", | |
| "df['month'] = datetime_full.apply(lambda x:x.month)\n", | |
| "df['day'] = datetime_full.apply(lambda x:x.day)\n", | |
| "\n", | |
| "df = df.sort_values(['year', 'month', 'day'])\n", | |
| "df = df.reset_index(drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "columns = [\"Raw\", \"SQRT\"]\n", | |
| "index = [\"Poisson with Outlier\", \"Negative Binomial with Outlier\",\n", | |
| " \"Poisson without Outlier\", \"Negative Binomial without Outlier\"]\n", | |
| "template = pd.DataFrame(columns=columns, index=index)\n", | |
| "mse = template.copy()\n", | |
| "normal_resid = template.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "all_data = {}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "try:\n", | |
| " with open('mse.p', 'rb') as handle:\n", | |
| " mse = pickle.load(handle)\n", | |
| "except:\n", | |
| " print(\"MSE ERROR\")\n", | |
| " \n", | |
| "try:\n", | |
| " with open('normal_resid.p', 'rb') as handle:\n", | |
| " normal_resid = pickle.load(handle)\n", | |
| "except:\n", | |
| " print(\"RESID ERROR\")\n", | |
| " \n", | |
| "try:\n", | |
| " with open('all_data.p', 'rb') as handle:\n", | |
| " all_data = pickle.load(handle)\n", | |
| "except:\n", | |
| " print(\"ALL DATA ERROR\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "normal_resid" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "all_data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Views" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "interest_variable = df['views_count']\n", | |
| "n_count_data = len(df)\n", | |
| "variable = tt.as_tensor_variable(interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Poisson with Outliers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_view_poisson_with_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1_mean = variable[:tau].mean()\n", | |
| " mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n", | |
| " \n", | |
| " mu2_mean = variable[tau:].mean()\n", | |
| " mu2 = pm.Exponential(\"mu_2\", 1 / mu2_mean)\n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = pm.math.switch(tau > idx, mu1, mu2)\n", | |
| "\n", | |
| " pm.Poisson(\"x\", switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_view_poisson_with_outliers:\n", | |
| " db = pm.backends.Text('poisson_with_outliers')\n", | |
| " trace_views_poisson_with_outliers = pm.sample(10000, tune=15000, chains=20, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_poisson_with_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_view_poisson_with_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_poisson_with_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n", | |
| "tau_samples = trace_views_poisson_with_outliers['tau']\n", | |
| "\n", | |
| "all_data['pwo_mu1'] = mu_1_samples\n", | |
| "all_data['pwo_mu2'] = mu_2_samples\n", | |
| "all_data['pwo_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n", | |
| "\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_1$\")\n", | |
| "\n", | |
| "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper center\", ncol=2)\n", | |
| "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "views_1_sample = np.random.poisson(mu_1_samples)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "views_2_sample = np.random.poisson(mu_2_samples)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)\n", | |
| "\n", | |
| "views_1_upper, views_1_lower, views_2_upper, views_2_lower" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n", | |
| " label=\"Expected Number of Views\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"Views\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = interest_variable - expected_views_per_day\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "plt.show()\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['Raw']['Poisson with Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['Raw']['Poisson with Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Negative Binomial with Outlier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_nbinomial_with_outliers:\n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1 = variable[tau:].mean()\n", | |
| " sigma1_2 = variable[tau:].var()\n", | |
| " alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n", | |
| " \n", | |
| " mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n", | |
| " alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n", | |
| " \n", | |
| " mu2 = variable[:tau].mean()\n", | |
| " sigma2_2 = variable[:tau].var()\n", | |
| " alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n", | |
| " \n", | |
| " mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n", | |
| " alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = tau > idx\n", | |
| " \n", | |
| " mu_switch = pm.math.switch(switch, mu1_, mu2_)\n", | |
| " alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n", | |
| "\n", | |
| " pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_nbinomial_with_outliers:\n", | |
| " db = pm.backends.Text('negative_binomial_with_outliers')\n", | |
| " trace_views_nbinomial_with_outliers = pm.sample(10000, tune=25000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_nbinomial_with_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_views_nbinomial_with_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_nbinomial_with_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n", | |
| "alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n", | |
| "alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n", | |
| "tau_samples = trace_views_nbinomial_with_outliers['tau']\n", | |
| "\n", | |
| "all_data['nbwo_mu1'] = mu_1_samples\n", | |
| "all_data['nbwo_mu2'] = mu_2_samples\n", | |
| "all_data['nbwo_alpha1'] = alpha_1_samples\n", | |
| "all_data['nbwo_alpha2'] = alpha_2_samples\n", | |
| "all_data['nbwo_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_1$\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_1$\")\n", | |
| "\n", | |
| "ax = axs[4]\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_2$\")\n", | |
| "\n", | |
| "ax = axs[5]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[6]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "significance = 0.05\n", | |
| "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n", | |
| "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n", | |
| "views_1_upper = np.quantile(views_1_sample, significance / 2)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 1 - significance / 2)\n", | |
| "\n", | |
| "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n", | |
| "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n", | |
| "views_2_upper = np.quantile(views_2_sample, significance / 2)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 1 - significance / 2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " \n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = interest_variable - expected_views_per_day\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "plt.show()\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['Raw']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['Raw']['Negative Binomial with Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Views" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_csv(\"pbs_data.csv\")\n", | |
| "\n", | |
| "for _ in range(3):\n", | |
| " index = df['views_count'].idxmax()\n", | |
| " df = df.drop(index)\n", | |
| "\n", | |
| "datetime_full = pd.to_datetime(df['date'])\n", | |
| "df['year'] = datetime_full.apply(lambda x:x.year)\n", | |
| "df['month'] = datetime_full.apply(lambda x:x.month)\n", | |
| "df['day'] = datetime_full.apply(lambda x:x.day)\n", | |
| "\n", | |
| "df = df.sort_values(['year', 'month', 'day'])\n", | |
| "df = df.reset_index(drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "interest_variable = df['views_count']\n", | |
| "n_count_data = len(df)\n", | |
| "variable = tt.as_tensor_variable(interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Poisson without Outlier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_poisson_without_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1_mean = variable[:tau].mean()\n", | |
| " mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n", | |
| " \n", | |
| " mu2_mean = variable[tau:].mean()\n", | |
| " mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = pm.math.switch(tau > idx, mu1, mu2)\n", | |
| "\n", | |
| " pm.Poisson(\"x\", switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_poisson_without_outliers:\n", | |
| " db = pm.backends.Text('poisson_without_outliers')\n", | |
| " step = pm.Metropolis()\n", | |
| " trace_views_poisson_without_outliers = pm.sample(15000, tune=25000, chains=20, cores=4, step=step, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_poisson_without_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_views_poisson_without_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_poisson_without_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n", | |
| "tau_samples = trace_views_poisson_without_outliers['tau']\n", | |
| "\n", | |
| "all_data['pwoo_mu1'] = mu_1_samples\n", | |
| "all_data['pwoo_mu2'] = mu_2_samples\n", | |
| "all_data['pwoo_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_1$\")\n", | |
| "\n", | |
| "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper center\", ncol=2)\n", | |
| "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "views_1_sample = np.random.poisson(mu_1_samples)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "views_2_sample = np.random.poisson(mu_2_samples)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = interest_variable - expected_views_per_day\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['Raw']['Poisson without Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['Raw']['Poisson without Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Negative Binomial without Outliers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_nbinomial_without_outliers:\n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1 = variable[tau:].mean()\n", | |
| " sigma1_2 = variable[tau:].var()\n", | |
| " alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n", | |
| " \n", | |
| " mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n", | |
| " alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n", | |
| " \n", | |
| " mu2 = variable[:tau].mean()\n", | |
| " sigma2_2 = variable[:tau].var()\n", | |
| " alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n", | |
| " \n", | |
| " mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n", | |
| " alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = tau > idx\n", | |
| " \n", | |
| " mu_switch = pm.math.switch(switch, mu1_, mu2_)\n", | |
| " alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n", | |
| "\n", | |
| " pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_nbinomial_without_outliers:\n", | |
| " db = pm.backends.Text('negative_binomial_without_outliers')\n", | |
| " trace_views_nbinomial_without_outliers = pm.sample(60000, tune=15000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model:2.2f}s aprox {round((time.perf_counter() - before_model) / 60):2.2f}min\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_nbinomial_without_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_views_nbinomial_without_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_nbinomial_without_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n", | |
| "alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n", | |
| "alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n", | |
| "tau_samples = trace_views_nbinomial_without_outliers['tau']\n", | |
| "\n", | |
| "all_data['nbwoo_mu1'] = mu_1_samples\n", | |
| "all_data['nbwoo_mu2'] = mu_2_samples\n", | |
| "all_data['nbwoo_alpha1'] = alpha_1_samples\n", | |
| "all_data['nbwoo_alpha2'] = alpha_2_samples\n", | |
| "all_data['nbwoo_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_1$\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_1$\")\n", | |
| "\n", | |
| "ax = axs[4]\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_2$\")\n", | |
| "\n", | |
| "ax = axs[5]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[6]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = all_data['nbwoo_mu1']\n", | |
| "mu_2_samples = all_data['nbwoo_mu2']\n", | |
| "tau_samples = all_data['nbwoo_tau']\n", | |
| "alpha_1_samples = all_data['nbwoo_alpha1']\n", | |
| "alpha_2_samples = all_data['nbwoo_alpha2']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import seaborn as sns" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, ax = plt.subplots(figsize=(16, 5))\n", | |
| "\n", | |
| "values = np.array(tau_samples, dtype=float)\n", | |
| "bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n", | |
| "ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior density')\n", | |
| "plt.xlim(29, 48)\n", | |
| "plt.xticks(np.arange(29, 49, 1))\n", | |
| "\n", | |
| "sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n", | |
| "\n", | |
| "ax.set_axisbelow(True)\n", | |
| "\n", | |
| "plt.legend(ncol=2)\n", | |
| "plt.title(\"Posterior Distribution for Tau\")\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Probability\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, ax = plt.subplots(figsize=(16, 5))\n", | |
| "plt.hist(mu_1_samples, bins=1400, alpha=0.85, label=f\"Posterior Distribution of $\\mu_1$\", density=True)\n", | |
| "plt.hist(mu_2_samples, bins=800, alpha=0.85, label=f\"Posterior Distribution of $\\mu_2$\", density=True)\n", | |
| "plt.legend(loc=\"upper right\", ncol=2)\n", | |
| "plt.xlabel(\"Number of Views (x $100.000$)\")\n", | |
| "plt.ylabel(\"Probability (x $10^{-5}$)\")\n", | |
| "plt.title(f\"Posterior Distribution Comparison\")\n", | |
| "plt.xlim(6e4, 22e4)\n", | |
| "plt.xticks(np.arange(6e4, 22.5e4, 5e3))\n", | |
| "ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n", | |
| "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n", | |
| "# plt.yscale('log')\n", | |
| "\n", | |
| "ax.set_axisbelow(True)\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n", | |
| "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n", | |
| "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "fig, ax = plt.subplots(figsize=(16, 5))\n", | |
| "\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " \n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n", | |
| " label=\"Expected Views\", ls=\"-.\")\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n", | |
| " label=\"95% Probabiliy Density\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 1) + 1)\n", | |
| "plt.xlim(0.8, len(df) + 0.1)\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "\n", | |
| "plt.ylim(0, 3.8e5)\n", | |
| "plt.ylabel(\"Views (x $10.000$)\")\n", | |
| "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n", | |
| "\n", | |
| "\n", | |
| "ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n", | |
| "ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n", | |
| "\n", | |
| "ax.fill_betweenx(y=[0, 3.8e5], x1=33, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n", | |
| "\n", | |
| "plt.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n", | |
| "\n", | |
| "handles, labels = ax.get_legend_handles_labels()\n", | |
| "labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n", | |
| "handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n", | |
| "\n", | |
| "plt.title(\"Posterior Predictive Distribution of Views\")\n", | |
| "plt.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2)\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "scale = 1.2\n", | |
| "plt.figure(figsize=(14.28 * scale, 6 * scale))\n", | |
| "\n", | |
| "ax = plt.subplot(212)\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n", | |
| " label=\"Expected Views\", ls=\"-.\")\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n", | |
| "\n", | |
| "ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n", | |
| " label=\"95% Probabiliy Density\")\n", | |
| "\n", | |
| "ax.set_xticks(np.arange(0, len(df), 1) + 1)\n", | |
| "ax.set_xlim(0.8, len(df) + 0.1)\n", | |
| "ax.set_xlabel(\"Video ID\")\n", | |
| "\n", | |
| "ax.set_ylim(0, 3.8e5)\n", | |
| "ax.set_ylabel(\"Views (x $100.000$)\")\n", | |
| "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n", | |
| "\n", | |
| "ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n", | |
| "ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n", | |
| "\n", | |
| "ax.fill_betweenx(y=[0, 3.8e5], x1=35, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n", | |
| "\n", | |
| "ax.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n", | |
| "\n", | |
| "handles, labels = ax.get_legend_handles_labels()\n", | |
| "labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n", | |
| "handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n", | |
| "\n", | |
| "ax.set_title(\"Posterior Predictive Distribution of Views\")\n", | |
| "ax.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2, prop={'size': 11})\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "ax = plt.subplot(221)\n", | |
| "\n", | |
| "ax.hist(mu_1_samples, bins=1400, label=f\"$\\mu_1$ Posterior Density\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=800, label=f\"$\\mu_2$ Posterior Density\", density=True)\n", | |
| "\n", | |
| "ax.set_xlim(6e4, 22e4)\n", | |
| "ax.set_xticks(np.arange(6e4, 22.5e4, 1e4))\n", | |
| "ax.set_xlabel(\"Number of Views (x $100.000$)\")\n", | |
| "ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n", | |
| "\n", | |
| "ax.set_ylim(0, 5.4e-5)\n", | |
| "ax.set_yticks(np.arange(0, 5.5e-5, 5e-6))\n", | |
| "ax.set_ylabel(\"Probability (x $10^{-5}$)\")\n", | |
| "ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n", | |
| "\n", | |
| "ax.set_axisbelow(True)\n", | |
| "ax.set_title(f\"Likelihood Parameters Posteriors\")\n", | |
| "ax.legend(loc=\"upper right\", prop={'size': 11})\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "ax = plt.subplot(222)\n", | |
| "\n", | |
| "values = np.array(tau_samples, dtype=float)\n", | |
| "bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n", | |
| "ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior Density')\n", | |
| "sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n", | |
| "\n", | |
| "ax.set_xlim(29, 48)\n", | |
| "ax.set_xticks(np.arange(29, 49, 1))\n", | |
| "ax.set_xlabel(\"Video ID\")\n", | |
| "\n", | |
| "ax.set_ylim(0, 0.195)\n", | |
| "ax.set_yticks(np.arange(0, 0.2, 0.02))\n", | |
| "ax.set_ylabel(\"Probability (x $10^{-1}$)\")\n", | |
| "ax.ticklabel_format(axis='y', style='sci', scilimits=(0,0))\n", | |
| "\n", | |
| "ax.set_axisbelow(True)\n", | |
| "ax.set_title(\"Change Point Posterior\")\n", | |
| "handles, labels = ax.get_legend_handles_labels()\n", | |
| "ax.legend(handles[::-1], labels[::-1], loc=\"upper right\", prop={'size': 11})\n", | |
| "\n", | |
| "plt.suptitle('Posterior and Posterior Predictive Distributions', fontsize=16)\n", | |
| "plt.tight_layout()\n", | |
| "plt.subplots_adjust(top=0.91)\n", | |
| "plt.savefig(\"posterior.pdf\")\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "resid = interest_variable - expected_views_per_day\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['Raw']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['Raw']['Negative Binomial without Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "normal_resid" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "all_data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# SQRT Views" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_csv(\"pbs_data.csv\")\n", | |
| "\n", | |
| "datetime_full = pd.to_datetime(df['date'])\n", | |
| "df['year'] = datetime_full.apply(lambda x:x.year)\n", | |
| "df['month'] = datetime_full.apply(lambda x:x.month)\n", | |
| "df['day'] = datetime_full.apply(lambda x:x.day)\n", | |
| "\n", | |
| "df = df.sort_values(['year', 'month', 'day'])\n", | |
| "df = df.reset_index(drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "interest_variable = np.sqrt(df['views_count'])\n", | |
| "n_count_data = len(df)\n", | |
| "variable = tt.as_tensor_variable(interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Poisson with Outliers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_view_poisson_with_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1_mean = variable[:tau].mean()\n", | |
| " mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n", | |
| " \n", | |
| " mu2_mean = variable[tau:].mean()\n", | |
| " mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = pm.math.switch(tau > idx, mu1, mu2)\n", | |
| "\n", | |
| " pm.Poisson(\"x\", switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_view_poisson_with_outliers:\n", | |
| " db = pm.backends.Text('poisson_sqrt_with_outliers')\n", | |
| " trace_views_poisson_with_outliers = pm.sample(draws=200000, tune=500000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duration = time.perf_counter() - before_model\n", | |
| "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_poisson_with_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_view_poisson_with_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_poisson_with_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n", | |
| "tau_samples = trace_views_poisson_with_outliers['tau']\n", | |
| "\n", | |
| "all_data['pwo_sqrt_mu1'] = mu_1_samples\n", | |
| "all_data['pwo_sqrt_mu2'] = mu_2_samples\n", | |
| "all_data['pwo_sqrt_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n", | |
| "\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_1$\")\n", | |
| "\n", | |
| "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper center\", ncol=2)\n", | |
| "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "views_1_sample = np.random.poisson(mu_1_samples)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "views_2_sample = np.random.poisson(mu_2_samples)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)\n", | |
| "\n", | |
| "views_1_upper, views_1_lower, views_2_upper, views_2_lower" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "plt.show()\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['SQRT']['Poisson with Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['SQRT']['Poisson with Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Negative Binomial with Outlier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_nbinomial_with_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1 = variable[tau:].mean()\n", | |
| " sigma1_2 = variable[tau:].var()\n", | |
| " alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n", | |
| " \n", | |
| " mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n", | |
| " alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n", | |
| " \n", | |
| " mu2 = variable[:tau].mean()\n", | |
| " sigma2_2 = variable[:tau].var()\n", | |
| " alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n", | |
| " \n", | |
| " mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n", | |
| " alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = tau > idx\n", | |
| " \n", | |
| " mu_switch = pm.math.switch(switch, mu1_, mu2_)\n", | |
| " alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n", | |
| "\n", | |
| " pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_nbinomial_with_outliers:\n", | |
| " db = pm.backends.Text('negative_binomial_sqrt_with_outliers')\n", | |
| " trace_views_nbinomial_with_outliers = pm.sample(tune=80000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duration = time.perf_counter() - before_model\n", | |
| "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_nbinomial_with_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_views_nbinomial_with_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_nbinomial_with_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n", | |
| "alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n", | |
| "alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n", | |
| "tau_samples = trace_views_nbinomial_with_outliers['tau']\n", | |
| "\n", | |
| "all_data['nbwo_sqrt_mu1'] = mu_1_samples\n", | |
| "all_data['nbwo_sqrt_mu2'] = mu_2_samples\n", | |
| "all_data['nbwo_sqrt_alpha1'] = alpha_1_samples\n", | |
| "all_data['nbwo_sqrt_alpha2'] = alpha_2_samples\n", | |
| "all_data['nbwo_sqrt_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_1$\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_1$\")\n", | |
| "\n", | |
| "ax = axs[4]\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_2$\")\n", | |
| "\n", | |
| "ax = axs[5]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[6]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n", | |
| "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n", | |
| "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " \n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "plt.show()\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['SQRT']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['SQRT']['Negative Binomial with Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Views" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_csv(\"pbs_data.csv\")\n", | |
| "\n", | |
| "for _ in range(3):\n", | |
| " index = df['views_count'].idxmax()\n", | |
| " df = df.drop(index)\n", | |
| "\n", | |
| "datetime_full = pd.to_datetime(df['date'])\n", | |
| "df['year'] = datetime_full.apply(lambda x:x.year)\n", | |
| "df['month'] = datetime_full.apply(lambda x:x.month)\n", | |
| "df['day'] = datetime_full.apply(lambda x:x.day)\n", | |
| "\n", | |
| "df = df.sort_values(['year', 'month', 'day'])\n", | |
| "df = df.reset_index(drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "interest_variable = np.sqrt(df['views_count'])\n", | |
| "n_count_data = len(df)\n", | |
| "variable = tt.as_tensor_variable(interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Poisson without Outlier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_poisson_without_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1_mean = variable[:tau].mean()\n", | |
| " mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n", | |
| " \n", | |
| " mu2_mean = variable[tau:].mean()\n", | |
| " mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = pm.math.switch(tau > idx, mu1, mu2)\n", | |
| "\n", | |
| " pm.Poisson(\"x\", switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_poisson_without_outliers:\n", | |
| " db = pm.backends.Text('poisson_sqrt_without_outliers')\n", | |
| " trace_views_poisson_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duration = time.perf_counter() - before_model\n", | |
| "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_poisson_without_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.model_to_graphviz(model_views_poisson_without_outliers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_poisson_without_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n", | |
| "tau_samples = trace_views_poisson_without_outliers['tau']\n", | |
| "\n", | |
| "all_data['pwoo_sqrt_mu1'] = mu_1_samples\n", | |
| "all_data['pwoo_sqrt_mu2'] = mu_2_samples\n", | |
| "all_data['pwoo_sqrt_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_1$\")\n", | |
| "\n", | |
| "plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n", | |
| "ax.legend(loc=\"upper center\", ncol=2)\n", | |
| "ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "views_1_sample = np.random.poisson(mu_1_samples)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "views_2_sample = np.random.poisson(mu_2_samples)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['SQRT']['Poisson without Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['SQRT']['Poisson without Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Negative Binomial without Outliers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "before_model = time.perf_counter()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with pm.Model() as model_views_nbinomial_without_outliers:\n", | |
| " \n", | |
| " tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n", | |
| " \n", | |
| " mu1 = variable[tau:].mean()\n", | |
| " sigma1_2 = variable[tau:].var()\n", | |
| " alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n", | |
| " \n", | |
| " mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n", | |
| " alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n", | |
| " \n", | |
| " mu2 = variable[:tau].mean()\n", | |
| " sigma2_2 = variable[:tau].var()\n", | |
| " alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n", | |
| " \n", | |
| " mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n", | |
| " alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n", | |
| " \n", | |
| " idx = np.arange(len(df))\n", | |
| " \n", | |
| " switch = tau > idx\n", | |
| " \n", | |
| " mu_switch = pm.math.switch(switch, mu1_, mu2_)\n", | |
| " alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n", | |
| "\n", | |
| " pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with model_views_nbinomial_without_outliers:\n", | |
| " db = pm.backends.Text('negative_binomial_sqrt_without_outliers')\n", | |
| " trace_views_nbinomial_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duration = time.perf_counter() - before_model\n", | |
| "print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pm.summary(trace_views_nbinomial_without_outliers).round(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = az.from_pymc3(\n", | |
| " trace=trace_views_nbinomial_without_outliers,\n", | |
| ")\n", | |
| "az.plot_trace(data);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n", | |
| "mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n", | |
| "alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n", | |
| "alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n", | |
| "tau_samples = trace_views_nbinomial_without_outliers['tau']\n", | |
| "\n", | |
| "all_data['nbwoo_sqrt_mu1'] = mu_1_samples\n", | |
| "all_data['nbwoo_sqrt_mu2'] = mu_2_samples\n", | |
| "all_data['nbwoo_sqrt_alpha1'] = alpha_1_samples\n", | |
| "all_data['nbwoo_sqrt_alpha2'] = alpha_2_samples\n", | |
| "all_data['nbwoo_sqrt_tau'] = tau_samples\n", | |
| "\n", | |
| "with open('all_data.p', 'wb') as handle:\n", | |
| " pickle.dump(all_data, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n", | |
| "\n", | |
| "ax = axs[0]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_1$\")\n", | |
| "\n", | |
| "ax = axs[1]\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$mu_2$\")\n", | |
| "\n", | |
| "ax = axs[2]\n", | |
| "ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n", | |
| "ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[3]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_1$\")\n", | |
| "\n", | |
| "ax = axs[4]\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(\"$alpha_2$\")\n", | |
| "\n", | |
| "ax = axs[5]\n", | |
| "ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n", | |
| "ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "\n", | |
| "ax = axs[6]\n", | |
| "w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n", | |
| "ax.hist(tau_samples, bins=n_count_data, alpha=1,\n", | |
| " label=r\"posterior of $\\tau$\",\n", | |
| " color=\"#467821\", weights=w)\n", | |
| "ax.legend(loc=\"upper left\")\n", | |
| "ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n", | |
| "\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n", | |
| "views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n", | |
| "views_1_upper = np.quantile(views_1_sample, 0.025)\n", | |
| "views_1_lower = np.quantile(views_1_sample, 0.975)\n", | |
| "\n", | |
| "p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n", | |
| "views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n", | |
| "views_2_upper = np.quantile(views_2_sample, 0.025)\n", | |
| "views_2_lower = np.quantile(views_2_sample, 0.975)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " \n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Residuals Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "plt.scatter(np.arange(len(df)), resid)\n", | |
| "plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n", | |
| "print(f\"MSE: {np.power(resid, 2).mean():4g}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "plt.figure(figsize=(16, 5))\n", | |
| "x, y = probplot(resid, plot=plt);\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()\n", | |
| "_, p = normaltest(resid)\n", | |
| "print(f\"P-Value for Normal Test: {p:.4g}\")\n", | |
| "if p > 1e-3:\n", | |
| " print(\"Suggest that the residuals are normaly distributed\")\n", | |
| "else:\n", | |
| " print(\"Suggest that the residuals are NOT normaly distributed\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse['SQRT']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n", | |
| "normal_resid['SQRT']['Negative Binomial without Outlier'] = p\n", | |
| "\n", | |
| "with open('mse.p', 'wb') as handle:\n", | |
| " pickle.dump(mse, handle)\n", | |
| "\n", | |
| "with open('normal_resid.p', 'wb') as handle:\n", | |
| " pickle.dump(normal_resid, handle)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Showing all Results" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "all_data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "mse_scaled = mse ** 0.5\n", | |
| "mse_scaled = mse_scaled.drop('LOG', axis=1)\n", | |
| "mse_scaled /= mse_scaled.min().min()\n", | |
| "mse_scaled = mse_scaled.astype(float).round(2)\n", | |
| "mse_scaled" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "normal_resid" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "normal_resid_scale = normal_resid.copy()\n", | |
| "normal_resid_scale = normal_resid_scale.drop('LOG', axis=1)\n", | |
| "normal_resid_scale *= 100\n", | |
| "normal_resid_scale > 0.5\n", | |
| "normal_resid_scale" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "elapsed_time = time.perf_counter() - start_running\n", | |
| "print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n", | |
| "print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n", | |
| "print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "N = tau_samples.shape[0]\n", | |
| "expected_views_per_day = np.zeros(n_count_data)\n", | |
| "expected_views_upper = np.zeros(n_count_data)\n", | |
| "expected_views_lower = np.zeros(n_count_data)\n", | |
| "\n", | |
| "plt.figure(figsize=(16, 5))\n", | |
| "\n", | |
| "for day in range(0, n_count_data):\n", | |
| " ix = day < tau_samples\n", | |
| " \n", | |
| " expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n", | |
| " expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n", | |
| " expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n", | |
| " label=\"expected number of text-messages received\")\n", | |
| "\n", | |
| "plt.xticks(np.arange(0, len(df), 2))\n", | |
| "plt.xlim(-1, len(df))\n", | |
| "plt.xlabel(\"Video ID\")\n", | |
| "plt.ylabel(\"Views\")\n", | |
| "plt.title(\"Expected number of views per video\")\n", | |
| "plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n", | |
| "\n", | |
| "plt.legend(loc=\"upper left\")\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Execution Time" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "elapsed_time = time.perf_counter() - start_running\n", | |
| "print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n", | |
| "print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n", | |
| "print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.7" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| views_count | date | likes | dislikes | comments_count | |
|---|---|---|---|---|---|
| 118579 | 17 May 2018 | 4243 | 675 | 1290 | |
| 102081 | 17 May 2018 | 1731 | 143 | 375 | |
| 54220 | 26 Apr 2018 | 1415 | 65 | 188 | |
| 60052 | 13 Apr 2018 | 1490 | 37 | 141 | |
| 37976 | 29 Mar 2018 | 1569 | 42 | 167 | |
| 98477 | 23 Mar 2018 | 2471 | 69 | 784 | |
| 43906 | 15 Mar 2018 | 1080 | 51 | 208 | |
| 82826 | 8 Mar 2018 | 2412 | 105 | 398 | |
| 77348 | 1 Mar 2018 | 2257 | 26 | 377 | |
| 82363 | 27 Feb 2018 | 3229 | 49 | 608 | |
| 77653 | 15 Feb 2018 | 2008 | 46 | 278 | |
| 133253 | 1 Feb 2018 | 3354 | 186 | 376 | |
| 140062 | 25 Jan 2018 | 4496 | 90 | 639 | |
| 69011 | 18 Jan 2018 | 1869 | 344 | 315 | |
| 60526 | 11 Jan 2018 | 2126 | 34 | 187 | |
| 67910 | 21 Dec 2017 | 1974 | 267 | 359 | |
| 63290 | 14 Dec 2017 | 2872 | 34 | 249 | |
| 84684 | 12 Dec 2017 | 2798 | 49 | 330 | |
| 70405 | 30 Nov 2017 | 2721 | 120 | 404 | |
| 104901 | 23 Nov 2017 | 4251 | 201 | 613 | |
| 139189 | 17 Nov 2017 | 6433 | 95 | 954 | |
| 114225 | 10 Nov 2017 | 3171 | 86 | 262 | |
| 703089 | 19 Oct 2017 | 17149 | 991 | 2374 | |
| 144368 | 12 Oct 2017 | 4645 | 75 | 547 | |
| 69256 | 6 Oct 2017 | 1845 | 23 | 207 | |
| 101029 | 28 Sep 2017 | 2907 | 58 | 319 | |
| 127403 | 21 Sep 2017 | 3748 | 80 | 540 | |
| 139737 | 14 Sep 2017 | 3635 | 81 | 684 | |
| 226200 | 24 Aug 2017 | 6287 | 130 | 569 | |
| 102046 | 23 Aug 2017 | 2929 | 44 | 275 | |
| 137518 | 22 Aug 2017 | 3799 | 43 | 259 | |
| 88791 | 10 Aug 2017 | 2985 | 190 | 659 | |
| 265561 | 3 Aug 2017 | 8735 | 197 | 768 | |
| 346312 | 21 Jul 2017 | 9094 | 644 | 1036 | |
| 114411 | 13 Jul 2017 | 2824 | 63 | 392 | |
| 83106 | 29 Jun 2017 | 2149 | 45 | 351 | |
| 122523 | 22 Jun 2017 | 2359 | 109 | 607 | |
| 106887 | 15 Jun 2017 | 3910 | 45 | 601 | |
| 92765 | 8 Jun 2017 | 3078 | 90 | 420 | |
| 98660 | 1 Jun 2017 | 4007 | 46 | 402 | |
| 222122 | 19 May 2017 | 5939 | 188 | 889 | |
| 126756 | 11 May 2017 | 3710 | 60 | 487 | |
| 103940 | 4 May 2017 | 2810 | 68 | 413 | |
| 198880 | 27 Apr 2017 | 4721 | 83 | 418 | |
| 210136 | 20 Apr 2017 | 5472 | 104 | 480 | |
| 193191 | 13 Apr 2017 | 4576 | 158 | 919 | |
| 23375 | 10 Apr 2017 | 1415 | 62 | 275 | |
| 89182 | 6 Apr 2017 | 2375 | 53 | 386 | |
| 174447 | 23 Mar 2017 | 4461 | 98 | 488 | |
| 91416 | 16 Mar 2017 | 2561 | 53 | 377 | |
| 168412 | 9 Mar 2017 | 5278 | 110 | 662 | |
| 312267 | 2 Mar 2017 | 6188 | 419 | 1156 | |
| 72497 | 23 Feb 2017 | 2103 | 57 | 284 | |
| 566168 | 16 Feb 2017 | 14809 | 336 | 1377 | |
| 158059 | 2 Feb 2017 | 3683 | 53 | 546 | |
| 241847 | 26 Jan 2017 | 6259 | 185 | 997 | |
| 313571 | 19 Jan 2017 | 7921 | 140 | 980 | |
| 153137 | 12 Jan 2017 | 4617 | 59 | 606 | |
| 274378 | 5 Jan 2017 | 8077 | 404 | 1304 | |
| 143995 | 22 Dec 2016 | 3565 | 75 | 403 | |
| 169567 | 15 Dec 2016 | 4033 | 70 | 968 | |
| 212652 | 8 Dec 2016 | 5400 | 145 | 1134 | |
| 111195 | 1 Dec 2016 | 3529 | 46 | 548 | |
| 318748 | 24 Nov 2016 | 8585 | 352 | 1826 | |
| 872483 | 17 Nov 2016 | 23675 | 745 | 2391 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment