Skip to content

Instantly share code, notes, and snippets.

@ELC
Last active September 17, 2020 15:47
Show Gist options
  • Save ELC/6a0f1716bc39d0ef21285722f0c57bd3 to your computer and use it in GitHub Desktop.
Save ELC/6a0f1716bc39d0ef21285722f0c57bd3 to your computer and use it in GitHub Desktop.
EMSS2020
chromium-chromedriver
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
name: EMSS-Castano
channels:
- conda-forge
dependencies:
- python=3.7
- numpy
- pandas
- scipy
- seaborn
- matplotlib
- selenium
- tqdm
- pymc3
- mkl-service
- pip:
- helium
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pickle\n",
"\n",
"import pymc3 as pm\n",
"import numpy as np\n",
"import pandas as pd\n",
"from scipy.stats import probplot, normaltest\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl\n",
"import arviz as az\n",
"import theano.tensor as tt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"start_running = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"plt.style.use('bmh')\n",
"mpl.rcParams['figure.figsize'] = (16, 10)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"pbs_data.csv\")\n",
"\n",
"datetime_full = pd.to_datetime(df['date'])\n",
"df['year'] = datetime_full.apply(lambda x:x.year)\n",
"df['month'] = datetime_full.apply(lambda x:x.month)\n",
"df['day'] = datetime_full.apply(lambda x:x.day)\n",
"\n",
"df = df.sort_values(['year', 'month', 'day'])\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns = [\"Raw\", \"SQRT\"]\n",
"index = [\"Poisson with Outlier\", \"Negative Binomial with Outlier\",\n",
" \"Poisson without Outlier\", \"Negative Binomial without Outlier\"]\n",
"template = pd.DataFrame(columns=columns, index=index)\n",
"mse = template.copy()\n",
"normal_resid = template.copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" with open('mse.p', 'rb') as handle:\n",
" mse = pickle.load(handle)\n",
"except:\n",
" print(\"MSE ERROR\")\n",
" \n",
"try:\n",
" with open('normal_resid.p', 'rb') as handle:\n",
" normal_resid = pickle.load(handle)\n",
"except:\n",
" print(\"RESID ERROR\")\n",
" \n",
"try:\n",
" with open('all_data.p', 'rb') as handle:\n",
" all_data = pickle.load(handle)\n",
"except:\n",
" print(\"ALL DATA ERROR\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"normal_resid"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Views"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"interest_variable = df['views_count']\n",
"n_count_data = len(df)\n",
"variable = tt.as_tensor_variable(interest_variable)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Poisson with Outliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_view_poisson_with_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1_mean = variable[:tau].mean()\n",
" mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
" \n",
" mu2_mean = variable[tau:].mean()\n",
" mu2 = pm.Exponential(\"mu_2\", 1 / mu2_mean)\n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = pm.math.switch(tau > idx, mu1, mu2)\n",
"\n",
" pm.Poisson(\"x\", switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_view_poisson_with_outliers:\n",
" db = pm.backends.Text('poisson_with_outliers')\n",
" trace_views_poisson_with_outliers = pm.sample(10000, tune=15000, chains=20, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_poisson_with_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_view_poisson_with_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_poisson_with_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n",
"mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n",
"tau_samples = trace_views_poisson_with_outliers['tau']\n",
"\n",
"all_data['pwo_mu1'] = mu_1_samples\n",
"all_data['pwo_mu2'] = mu_2_samples\n",
"all_data['pwo_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
"\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_1$\")\n",
"\n",
"plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper center\", ncol=2)\n",
"ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
"\n",
"ax = axs[3]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views_1_sample = np.random.poisson(mu_1_samples)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"views_2_sample = np.random.poisson(mu_2_samples)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)\n",
"\n",
"views_1_upper, views_1_lower, views_2_upper, views_2_lower"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
" label=\"Expected Number of Views\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"Views\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = interest_variable - expected_views_per_day\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"plt.show()\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['Raw']['Poisson with Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['Raw']['Poisson with Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Negative Binomial with Outlier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_nbinomial_with_outliers:\n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1 = variable[tau:].mean()\n",
" sigma1_2 = variable[tau:].var()\n",
" alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
" \n",
" mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
" alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
" \n",
" mu2 = variable[:tau].mean()\n",
" sigma2_2 = variable[:tau].var()\n",
" alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
" \n",
" mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
" alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = tau > idx\n",
" \n",
" mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
" alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
"\n",
" pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with model_views_nbinomial_with_outliers:\n",
" db = pm.backends.Text('negative_binomial_with_outliers')\n",
" trace_views_nbinomial_with_outliers = pm.sample(10000, tune=25000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_nbinomial_with_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_views_nbinomial_with_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
"\n",
"ax = axs[2]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_nbinomial_with_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n",
"mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n",
"alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n",
"alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n",
"tau_samples = trace_views_nbinomial_with_outliers['tau']\n",
"\n",
"all_data['nbwo_mu1'] = mu_1_samples\n",
"all_data['nbwo_mu2'] = mu_2_samples\n",
"all_data['nbwo_alpha1'] = alpha_1_samples\n",
"all_data['nbwo_alpha2'] = alpha_2_samples\n",
"all_data['nbwo_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_1$\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[3]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_1$\")\n",
"\n",
"ax = axs[4]\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_2$\")\n",
"\n",
"ax = axs[5]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[6]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"significance = 0.05\n",
"p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
"views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
"views_1_upper = np.quantile(views_1_sample, significance / 2)\n",
"views_1_lower = np.quantile(views_1_sample, 1 - significance / 2)\n",
"\n",
"p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
"views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
"views_2_upper = np.quantile(views_2_sample, significance / 2)\n",
"views_2_lower = np.quantile(views_2_sample, 1 - significance / 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" \n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = interest_variable - expected_views_per_day\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"plt.show()\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['Raw']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['Raw']['Negative Binomial with Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Views"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"pbs_data.csv\")\n",
"\n",
"for _ in range(3):\n",
" index = df['views_count'].idxmax()\n",
" df = df.drop(index)\n",
"\n",
"datetime_full = pd.to_datetime(df['date'])\n",
"df['year'] = datetime_full.apply(lambda x:x.year)\n",
"df['month'] = datetime_full.apply(lambda x:x.month)\n",
"df['day'] = datetime_full.apply(lambda x:x.day)\n",
"\n",
"df = df.sort_values(['year', 'month', 'day'])\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"interest_variable = df['views_count']\n",
"n_count_data = len(df)\n",
"variable = tt.as_tensor_variable(interest_variable)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Poisson without Outlier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_poisson_without_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1_mean = variable[:tau].mean()\n",
" mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
" \n",
" mu2_mean = variable[tau:].mean()\n",
" mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = pm.math.switch(tau > idx, mu1, mu2)\n",
"\n",
" pm.Poisson(\"x\", switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_views_poisson_without_outliers:\n",
" db = pm.backends.Text('poisson_without_outliers')\n",
" step = pm.Metropolis()\n",
" trace_views_poisson_without_outliers = pm.sample(15000, tune=25000, chains=20, cores=4, step=step, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_poisson_without_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_views_poisson_without_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_poisson_without_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n",
"mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n",
"tau_samples = trace_views_poisson_without_outliers['tau']\n",
"\n",
"all_data['pwoo_mu1'] = mu_1_samples\n",
"all_data['pwoo_mu2'] = mu_2_samples\n",
"all_data['pwoo_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_1$\")\n",
"\n",
"plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper center\", ncol=2)\n",
"ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
"\n",
"ax = axs[3]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views_1_sample = np.random.poisson(mu_1_samples)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"views_2_sample = np.random.poisson(mu_2_samples)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), expected_views_per_day, lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_upper, lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), expected_views_lower, lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), interest_variable, color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = interest_variable - expected_views_per_day\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['Raw']['Poisson without Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['Raw']['Poisson without Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Negative Binomial without Outliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_nbinomial_without_outliers:\n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1 = variable[tau:].mean()\n",
" sigma1_2 = variable[tau:].var()\n",
" alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
" \n",
" mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
" alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
" \n",
" mu2 = variable[:tau].mean()\n",
" sigma2_2 = variable[:tau].var()\n",
" alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
" \n",
" mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
" alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = tau > idx\n",
" \n",
" mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
" alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
"\n",
" pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_views_nbinomial_without_outliers:\n",
" db = pm.backends.Text('negative_binomial_without_outliers')\n",
" trace_views_nbinomial_without_outliers = pm.sample(60000, tune=15000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Model Definition and fitting lasted: {time.perf_counter() - before_model:2.2f}s aprox {round((time.perf_counter() - before_model) / 60):2.2f}min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_nbinomial_without_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_views_nbinomial_without_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
"\n",
"ax = axs[2]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_nbinomial_without_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n",
"mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n",
"alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n",
"alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n",
"tau_samples = trace_views_nbinomial_without_outliers['tau']\n",
"\n",
"all_data['nbwoo_mu1'] = mu_1_samples\n",
"all_data['nbwoo_mu2'] = mu_2_samples\n",
"all_data['nbwoo_alpha1'] = alpha_1_samples\n",
"all_data['nbwoo_alpha2'] = alpha_2_samples\n",
"all_data['nbwoo_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_1$\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[3]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_1$\")\n",
"\n",
"ax = axs[4]\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_2$\")\n",
"\n",
"ax = axs[5]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[6]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = all_data['nbwoo_mu1']\n",
"mu_2_samples = all_data['nbwoo_mu2']\n",
"tau_samples = all_data['nbwoo_tau']\n",
"alpha_1_samples = all_data['nbwoo_alpha1']\n",
"alpha_2_samples = all_data['nbwoo_alpha2']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(16, 5))\n",
"\n",
"values = np.array(tau_samples, dtype=float)\n",
"bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n",
"ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior density')\n",
"plt.xlim(29, 48)\n",
"plt.xticks(np.arange(29, 49, 1))\n",
"\n",
"sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n",
"\n",
"ax.set_axisbelow(True)\n",
"\n",
"plt.legend(ncol=2)\n",
"plt.title(\"Posterior Distribution for Tau\")\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Probability\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(16, 5))\n",
"plt.hist(mu_1_samples, bins=1400, alpha=0.85, label=f\"Posterior Distribution of $\\mu_1$\", density=True)\n",
"plt.hist(mu_2_samples, bins=800, alpha=0.85, label=f\"Posterior Distribution of $\\mu_2$\", density=True)\n",
"plt.legend(loc=\"upper right\", ncol=2)\n",
"plt.xlabel(\"Number of Views (x $100.000$)\")\n",
"plt.ylabel(\"Probability (x $10^{-5}$)\")\n",
"plt.title(f\"Posterior Distribution Comparison\")\n",
"plt.xlim(6e4, 22e4)\n",
"plt.xticks(np.arange(6e4, 22.5e4, 5e3))\n",
"ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n",
"ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
"# plt.yscale('log')\n",
"\n",
"ax.set_axisbelow(True)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
"views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
"views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"fig, ax = plt.subplots(figsize=(16, 5))\n",
"\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" \n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n",
" label=\"Expected Views\", ls=\"-.\")\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n",
" label=\"95% Probabiliy Density\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 1) + 1)\n",
"plt.xlim(0.8, len(df) + 0.1)\n",
"plt.xlabel(\"Video ID\")\n",
"\n",
"plt.ylim(0, 3.8e5)\n",
"plt.ylabel(\"Views (x $10.000$)\")\n",
"ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
"\n",
"\n",
"ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n",
"ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n",
"\n",
"ax.fill_betweenx(y=[0, 3.8e5], x1=33, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n",
"\n",
"plt.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n",
"\n",
"handles, labels = ax.get_legend_handles_labels()\n",
"labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n",
"handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n",
"\n",
"plt.title(\"Posterior Predictive Distribution of Views\")\n",
"plt.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scale = 1.2\n",
"plt.figure(figsize=(14.28 * scale, 6 * scale))\n",
"\n",
"ax = plt.subplot(212)\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_per_day, lw=3, color=\"#404045\",\n",
" label=\"Expected Views\", ls=\"-.\")\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_upper, lw=2, color=\"#7A68A6\", ls=\"--\")\n",
"\n",
"ax.plot(range(1, n_count_data + 1), expected_views_lower, lw=2, color=\"#7A68A6\", ls=\"--\",\n",
" label=\"95% Probabiliy Density\")\n",
"\n",
"ax.set_xticks(np.arange(0, len(df), 1) + 1)\n",
"ax.set_xlim(0.8, len(df) + 0.1)\n",
"ax.set_xlabel(\"Video ID\")\n",
"\n",
"ax.set_ylim(0, 3.8e5)\n",
"ax.set_ylabel(\"Views (x $100.000$)\")\n",
"ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
"\n",
"ax.axvline(37, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4)\n",
"ax.axvline(42, alpha=0.7, ls=\":\", color=\"#009E73\", lw=4, label=\"Modes of τ\")\n",
"\n",
"ax.fill_betweenx(y=[0, 3.8e5], x1=35, x2=45, alpha=0.08, color='#009E73', label=\"95% HPD for τ\")\n",
"\n",
"ax.scatter(np.arange(len(interest_variable))+1, interest_variable, color=\"#348ABD\", label=\"Views\")\n",
"\n",
"handles, labels = ax.get_legend_handles_labels()\n",
"labels_sorted = [labels[0], labels[2], labels[4], labels[1], labels[3]]\n",
"handles_sorted = [handles[0], handles[2], handles[4], handles[1], handles[3]]\n",
"\n",
"ax.set_title(\"Posterior Predictive Distribution of Views\")\n",
"ax.legend(handles_sorted, labels_sorted, loc=\"upper right\", ncol=2, prop={'size': 11})\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"ax = plt.subplot(221)\n",
"\n",
"ax.hist(mu_1_samples, bins=1400, label=f\"$\\mu_1$ Posterior Density\", density=True)\n",
"ax.hist(mu_2_samples, bins=800, label=f\"$\\mu_2$ Posterior Density\", density=True)\n",
"\n",
"ax.set_xlim(6e4, 22e4)\n",
"ax.set_xticks(np.arange(6e4, 22.5e4, 1e4))\n",
"ax.set_xlabel(\"Number of Views (x $100.000$)\")\n",
"ax.ticklabel_format(axis=\"x\", style=\"sci\", scilimits=(0, 0))\n",
"\n",
"ax.set_ylim(0, 5.4e-5)\n",
"ax.set_yticks(np.arange(0, 5.5e-5, 5e-6))\n",
"ax.set_ylabel(\"Probability (x $10^{-5}$)\")\n",
"ax.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0, 0))\n",
"\n",
"ax.set_axisbelow(True)\n",
"ax.set_title(f\"Likelihood Parameters Posteriors\")\n",
"ax.legend(loc=\"upper right\", prop={'size': 11})\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"ax = plt.subplot(222)\n",
"\n",
"values = np.array(tau_samples, dtype=float)\n",
"bins = np.arange(values.min(), values.max() + 1.5) - 0.5\n",
"ax.hist(values, bins=bins, rwidth=0.9, density=True, label='τ Posterior Density')\n",
"sns.kdeplot(values, lw=3, bw=0.75, gridsize=200, linestyle=\"--\", color=\"#333333\", label=\"KDE for τ\")\n",
"\n",
"ax.set_xlim(29, 48)\n",
"ax.set_xticks(np.arange(29, 49, 1))\n",
"ax.set_xlabel(\"Video ID\")\n",
"\n",
"ax.set_ylim(0, 0.195)\n",
"ax.set_yticks(np.arange(0, 0.2, 0.02))\n",
"ax.set_ylabel(\"Probability (x $10^{-1}$)\")\n",
"ax.ticklabel_format(axis='y', style='sci', scilimits=(0,0))\n",
"\n",
"ax.set_axisbelow(True)\n",
"ax.set_title(\"Change Point Posterior\")\n",
"handles, labels = ax.get_legend_handles_labels()\n",
"ax.legend(handles[::-1], labels[::-1], loc=\"upper right\", prop={'size': 11})\n",
"\n",
"plt.suptitle('Posterior and Posterior Predictive Distributions', fontsize=16)\n",
"plt.tight_layout()\n",
"plt.subplots_adjust(top=0.91)\n",
"plt.savefig(\"posterior.pdf\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"resid = interest_variable - expected_views_per_day\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['Raw']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['Raw']['Negative Binomial without Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"normal_resid"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SQRT Views"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"pbs_data.csv\")\n",
"\n",
"datetime_full = pd.to_datetime(df['date'])\n",
"df['year'] = datetime_full.apply(lambda x:x.year)\n",
"df['month'] = datetime_full.apply(lambda x:x.month)\n",
"df['day'] = datetime_full.apply(lambda x:x.day)\n",
"\n",
"df = df.sort_values(['year', 'month', 'day'])\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"interest_variable = np.sqrt(df['views_count'])\n",
"n_count_data = len(df)\n",
"variable = tt.as_tensor_variable(interest_variable)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Poisson with Outliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_view_poisson_with_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1_mean = variable[:tau].mean()\n",
" mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
" \n",
" mu2_mean = variable[tau:].mean()\n",
" mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = pm.math.switch(tau > idx, mu1, mu2)\n",
"\n",
" pm.Poisson(\"x\", switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_view_poisson_with_outliers:\n",
" db = pm.backends.Text('poisson_sqrt_with_outliers')\n",
" trace_views_poisson_with_outliers = pm.sample(draws=200000, tune=500000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"duration = time.perf_counter() - before_model\n",
"print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_poisson_with_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_view_poisson_with_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_poisson_with_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_poisson_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_poisson_with_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_poisson_with_outliers['mu_1']\n",
"mu_2_samples = trace_views_poisson_with_outliers['mu_2']\n",
"tau_samples = trace_views_poisson_with_outliers['tau']\n",
"\n",
"all_data['pwo_sqrt_mu1'] = mu_1_samples\n",
"all_data['pwo_sqrt_mu2'] = mu_2_samples\n",
"all_data['pwo_sqrt_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
"\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_1$\")\n",
"\n",
"plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper center\", ncol=2)\n",
"ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
"\n",
"ax = axs[3]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views_1_sample = np.random.poisson(mu_1_samples)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"views_2_sample = np.random.poisson(mu_2_samples)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)\n",
"\n",
"views_1_upper, views_1_lower, views_2_upper, views_2_lower"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"plt.show()\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['SQRT']['Poisson with Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['SQRT']['Poisson with Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Negative Binomial with Outlier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_nbinomial_with_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1 = variable[tau:].mean()\n",
" sigma1_2 = variable[tau:].var()\n",
" alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
" \n",
" mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
" alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
" \n",
" mu2 = variable[:tau].mean()\n",
" sigma2_2 = variable[:tau].var()\n",
" alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
" \n",
" mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
" alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = tau > idx\n",
" \n",
" mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
" alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
"\n",
" pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_views_nbinomial_with_outliers:\n",
" db = pm.backends.Text('negative_binomial_sqrt_with_outliers')\n",
" trace_views_nbinomial_with_outliers = pm.sample(tune=80000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"duration = time.perf_counter() - before_model\n",
"print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_nbinomial_with_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_views_nbinomial_with_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
"\n",
"ax = axs[2]\n",
"pm.forestplot(trace_views_nbinomial_with_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_nbinomial_with_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_nbinomial_with_outliers['mu_1']\n",
"mu_2_samples = trace_views_nbinomial_with_outliers['mu_2']\n",
"alpha_1_samples = trace_views_nbinomial_with_outliers['alpha_1']\n",
"alpha_2_samples = trace_views_nbinomial_with_outliers['alpha_2']\n",
"tau_samples = trace_views_nbinomial_with_outliers['tau']\n",
"\n",
"all_data['nbwo_sqrt_mu1'] = mu_1_samples\n",
"all_data['nbwo_sqrt_mu2'] = mu_2_samples\n",
"all_data['nbwo_sqrt_alpha1'] = alpha_1_samples\n",
"all_data['nbwo_sqrt_alpha2'] = alpha_2_samples\n",
"all_data['nbwo_sqrt_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_1$\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[3]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_1$\")\n",
"\n",
"ax = axs[4]\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_2$\")\n",
"\n",
"ax = axs[5]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[6]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
"views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
"views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" \n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"plt.show()\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['SQRT']['Negative Binomial with Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['SQRT']['Negative Binomial with Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Views"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"pbs_data.csv\")\n",
"\n",
"for _ in range(3):\n",
" index = df['views_count'].idxmax()\n",
" df = df.drop(index)\n",
"\n",
"datetime_full = pd.to_datetime(df['date'])\n",
"df['year'] = datetime_full.apply(lambda x:x.year)\n",
"df['month'] = datetime_full.apply(lambda x:x.month)\n",
"df['day'] = datetime_full.apply(lambda x:x.day)\n",
"\n",
"df = df.sort_values(['year', 'month', 'day'])\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"interest_variable = np.sqrt(df['views_count'])\n",
"n_count_data = len(df)\n",
"variable = tt.as_tensor_variable(interest_variable)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Poisson without Outlier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_poisson_without_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1_mean = variable[:tau].mean()\n",
" mu1 = pm.Exponential(\"mu_1\", 1 / mu1_mean)\n",
" \n",
" mu2_mean = variable[tau:].mean()\n",
" mu2 = pm.Exponential(\"mu_2\", 1 /mu2_mean)\n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = pm.math.switch(tau > idx, mu1, mu2)\n",
"\n",
" pm.Poisson(\"x\", switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_views_poisson_without_outliers:\n",
" db = pm.backends.Text('poisson_sqrt_without_outliers')\n",
" trace_views_poisson_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"duration = time.perf_counter() - before_model\n",
"print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_poisson_without_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.model_to_graphviz(model_views_poisson_without_outliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 2, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_poisson_without_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_poisson_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_poisson_without_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_poisson_without_outliers['mu_1']\n",
"mu_2_samples = trace_views_poisson_without_outliers['mu_2']\n",
"tau_samples = trace_views_poisson_without_outliers['tau']\n",
"\n",
"all_data['pwoo_sqrt_mu1'] = mu_1_samples\n",
"all_data['pwoo_sqrt_mu2'] = mu_2_samples\n",
"all_data['pwoo_sqrt_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(4, 1, figsize=(12.5, 10))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_1$\")\n",
"\n",
"plt.title(r\"\"\"Posterior distributions of the variables $\\mu_1,\\;\\mu_2,\\;\\tau$\"\"\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$\\mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $\\mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=\"posterior of $\\mu_2$\", color=\"#7A68A6\", density=True)\n",
"ax.legend(loc=\"upper center\", ncol=2)\n",
"ax.set_xlabel(\"$\\mu_1$ vs $\\mu_2$\")\n",
"\n",
"ax = axs[3]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views_1_sample = np.random.poisson(mu_1_samples)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"views_2_sample = np.random.poisson(mu_2_samples)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['SQRT']['Poisson without Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['SQRT']['Poisson without Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Negative Binomial without Outliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before_model = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pm.Model() as model_views_nbinomial_without_outliers:\n",
" \n",
" tau = pm.DiscreteUniform(\"tau\", lower=0, upper=len(df) - 1)\n",
" \n",
" mu1 = variable[tau:].mean()\n",
" sigma1_2 = variable[tau:].var()\n",
" alpha1 = (mu1 ** 2)/(sigma1_2 - mu1)\n",
" \n",
" mu1_ = pm.Exponential(\"mu_1\", 1 / mu1)\n",
" alpha1_ = pm.Exponential(\"alpha_1\", 1 / alpha1)\n",
" \n",
" mu2 = variable[:tau].mean()\n",
" sigma2_2 = variable[:tau].var()\n",
" alpha2 = (mu2 ** 2)/(sigma2_2 - mu2)\n",
" \n",
" mu2_ = pm.Exponential(\"mu_2\", 1 / mu2)\n",
" alpha2_ = pm.Exponential(\"alpha_2\", 1 / alpha2) \n",
" \n",
" idx = np.arange(len(df))\n",
" \n",
" switch = tau > idx\n",
" \n",
" mu_switch = pm.math.switch(switch, mu1_, mu2_)\n",
" alpha_switch = pm.math.switch(switch, alpha1_, alpha2_)\n",
"\n",
" pm.NegativeBinomial(\"x\", mu_switch, alpha_switch, observed=interest_variable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with model_views_nbinomial_without_outliers:\n",
" db = pm.backends.Text('negative_binomial_sqrt_without_outliers')\n",
" trace_views_nbinomial_without_outliers = pm.sample(tune=20000, chains=20, cores=4, trace=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"duration = time.perf_counter() - before_model\n",
"print(f\"Model Definition and fitting lasted: {duration:2.2f}s aprox {round(duration / 60):2.2f}min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pm.summary(trace_views_nbinomial_without_outliers).round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(1, 3, figsize=(16, 5))\n",
"\n",
"ax = axs[0]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['tau'], ax=ax);\n",
"\n",
"ax = axs[1]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['mu_1', 'mu_2'], ax=ax);\n",
"\n",
"ax = axs[2]\n",
"pm.forestplot(trace_views_nbinomial_without_outliers, var_names=['alpha_1', 'alpha_2'], ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = az.from_pymc3(\n",
" trace=trace_views_nbinomial_without_outliers,\n",
")\n",
"az.plot_trace(data);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mu_1_samples = trace_views_nbinomial_without_outliers['mu_1']\n",
"mu_2_samples = trace_views_nbinomial_without_outliers['mu_2']\n",
"alpha_1_samples = trace_views_nbinomial_without_outliers['alpha_1']\n",
"alpha_2_samples = trace_views_nbinomial_without_outliers['alpha_2']\n",
"tau_samples = trace_views_nbinomial_without_outliers['tau']\n",
"\n",
"all_data['nbwoo_sqrt_mu1'] = mu_1_samples\n",
"all_data['nbwoo_sqrt_mu2'] = mu_2_samples\n",
"all_data['nbwoo_sqrt_alpha1'] = alpha_1_samples\n",
"all_data['nbwoo_sqrt_alpha2'] = alpha_2_samples\n",
"all_data['nbwoo_sqrt_tau'] = tau_samples\n",
"\n",
"with open('all_data.p', 'wb') as handle:\n",
" pickle.dump(all_data, handle)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, axs = plt.subplots(7, 1, figsize=(12.5, 15))\n",
"\n",
"ax = axs[0]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_1$\")\n",
"\n",
"ax = axs[1]\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$mu_2$\")\n",
"\n",
"ax = axs[2]\n",
"ax.hist(mu_1_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_1$\", density=True)\n",
"ax.hist(mu_2_samples, bins=30, alpha=0.85, label=f\"posterior of $mu_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[3]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_1$\")\n",
"\n",
"ax = axs[4]\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(\"$alpha_2$\")\n",
"\n",
"ax = axs[5]\n",
"ax.hist(alpha_1_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_1$\", density=True)\n",
"ax.hist(alpha_2_samples, bins=30, alpha=0.85, label=f\"posterior of $alpha_2$\", density=True)\n",
"ax.legend(loc=\"upper left\")\n",
"\n",
"ax = axs[6]\n",
"w = 1 / tau_samples.shape[0] * np.ones_like(tau_samples)\n",
"ax.hist(tau_samples, bins=n_count_data, alpha=1,\n",
" label=r\"posterior of $\\tau$\",\n",
" color=\"#467821\", weights=w)\n",
"ax.legend(loc=\"upper left\")\n",
"ax.set_xlabel(r\"$\\tau$ (Video Number)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p_1 = alpha_1_samples / (mu_1_samples + alpha_1_samples)\n",
"views_1_sample = np.random.negative_binomial(alpha_1_samples, p_1)\n",
"views_1_upper = np.quantile(views_1_sample, 0.025)\n",
"views_1_lower = np.quantile(views_1_sample, 0.975)\n",
"\n",
"p_2 = alpha_2_samples / (mu_2_samples + alpha_2_samples)\n",
"views_2_sample = np.random.negative_binomial(alpha_2_samples, p_2)\n",
"views_2_upper = np.quantile(views_2_sample, 0.025)\n",
"views_2_lower = np.quantile(views_2_sample, 0.975)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" \n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Residuals Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resid = np.power(interest_variable, 2) - np.power(expected_views_per_day, 2)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.scatter(np.arange(len(df)), resid)\n",
"plt.axhline(y=0, ls=\"--\", color=\"#404045\")\n",
"print(f\"MSE: {np.power(resid, 2).mean():4g}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(16, 5))\n",
"x, y = probplot(resid, plot=plt);\n",
"plt.tight_layout()\n",
"plt.show()\n",
"_, p = normaltest(resid)\n",
"print(f\"P-Value for Normal Test: {p:.4g}\")\n",
"if p > 1e-3:\n",
" print(\"Suggest that the residuals are normaly distributed\")\n",
"else:\n",
" print(\"Suggest that the residuals are NOT normaly distributed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse['SQRT']['Negative Binomial without Outlier'] = np.power(resid, 2).mean()\n",
"normal_resid['SQRT']['Negative Binomial without Outlier'] = p\n",
"\n",
"with open('mse.p', 'wb') as handle:\n",
" pickle.dump(mse, handle)\n",
"\n",
"with open('normal_resid.p', 'wb') as handle:\n",
" pickle.dump(normal_resid, handle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Showing all Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse_scaled = mse ** 0.5\n",
"mse_scaled = mse_scaled.drop('LOG', axis=1)\n",
"mse_scaled /= mse_scaled.min().min()\n",
"mse_scaled = mse_scaled.astype(float).round(2)\n",
"mse_scaled"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"normal_resid"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"normal_resid_scale = normal_resid.copy()\n",
"normal_resid_scale = normal_resid_scale.drop('LOG', axis=1)\n",
"normal_resid_scale *= 100\n",
"normal_resid_scale > 0.5\n",
"normal_resid_scale"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"elapsed_time = time.perf_counter() - start_running\n",
"print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n",
"print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n",
"print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"N = tau_samples.shape[0]\n",
"expected_views_per_day = np.zeros(n_count_data)\n",
"expected_views_upper = np.zeros(n_count_data)\n",
"expected_views_lower = np.zeros(n_count_data)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"\n",
"for day in range(0, n_count_data):\n",
" ix = day < tau_samples\n",
" \n",
" expected_views_per_day[day] = (mu_1_samples[ix].sum() + mu_2_samples[~ix].sum()) / N\n",
" expected_views_upper[day] = (np.repeat(views_1_upper, len(ix))[ix].sum() + np.repeat(views_2_upper, len(ix))[~ix].sum()) / N\n",
" expected_views_lower[day] = (np.repeat(views_1_lower, len(ix))[ix].sum() + np.repeat(views_2_lower, len(ix))[~ix].sum()) / N\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_per_day, 2), lw=4, color=\"#E24A33\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_upper, 2), lw=2, color=\"royalblue\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.plot(range(n_count_data), np.power(expected_views_lower, 2), lw=2, color=\"firebrick\", ls=\"--\",\n",
" label=\"expected number of text-messages received\")\n",
"\n",
"plt.xticks(np.arange(0, len(df), 2))\n",
"plt.xlim(-1, len(df))\n",
"plt.xlabel(\"Video ID\")\n",
"plt.ylabel(\"Views\")\n",
"plt.title(\"Expected number of views per video\")\n",
"plt.scatter(np.arange(len(interest_variable)), np.power(interest_variable, 2), color=\"#348ABD\", alpha=0.65, label=\"observed texts per day\")\n",
"\n",
"plt.legend(loc=\"upper left\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Execution Time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"elapsed_time = time.perf_counter() - start_running\n",
"print(f\"Total elapsed time: {elapsed_time:5.2f} s\")\n",
"print(f\"Total elapsed time: {int((elapsed_time / 60) % 60):5.0f} min\")\n",
"print(f\"Total elapsed time: {int((elapsed_time / 60 /60) % 24):5.0f} hs\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
views_count date likes dislikes comments_count
118579 17 May 2018 4243 675 1290
102081 17 May 2018 1731 143 375
54220 26 Apr 2018 1415 65 188
60052 13 Apr 2018 1490 37 141
37976 29 Mar 2018 1569 42 167
98477 23 Mar 2018 2471 69 784
43906 15 Mar 2018 1080 51 208
82826 8 Mar 2018 2412 105 398
77348 1 Mar 2018 2257 26 377
82363 27 Feb 2018 3229 49 608
77653 15 Feb 2018 2008 46 278
133253 1 Feb 2018 3354 186 376
140062 25 Jan 2018 4496 90 639
69011 18 Jan 2018 1869 344 315
60526 11 Jan 2018 2126 34 187
67910 21 Dec 2017 1974 267 359
63290 14 Dec 2017 2872 34 249
84684 12 Dec 2017 2798 49 330
70405 30 Nov 2017 2721 120 404
104901 23 Nov 2017 4251 201 613
139189 17 Nov 2017 6433 95 954
114225 10 Nov 2017 3171 86 262
703089 19 Oct 2017 17149 991 2374
144368 12 Oct 2017 4645 75 547
69256 6 Oct 2017 1845 23 207
101029 28 Sep 2017 2907 58 319
127403 21 Sep 2017 3748 80 540
139737 14 Sep 2017 3635 81 684
226200 24 Aug 2017 6287 130 569
102046 23 Aug 2017 2929 44 275
137518 22 Aug 2017 3799 43 259
88791 10 Aug 2017 2985 190 659
265561 3 Aug 2017 8735 197 768
346312 21 Jul 2017 9094 644 1036
114411 13 Jul 2017 2824 63 392
83106 29 Jun 2017 2149 45 351
122523 22 Jun 2017 2359 109 607
106887 15 Jun 2017 3910 45 601
92765 8 Jun 2017 3078 90 420
98660 1 Jun 2017 4007 46 402
222122 19 May 2017 5939 188 889
126756 11 May 2017 3710 60 487
103940 4 May 2017 2810 68 413
198880 27 Apr 2017 4721 83 418
210136 20 Apr 2017 5472 104 480
193191 13 Apr 2017 4576 158 919
23375 10 Apr 2017 1415 62 275
89182 6 Apr 2017 2375 53 386
174447 23 Mar 2017 4461 98 488
91416 16 Mar 2017 2561 53 377
168412 9 Mar 2017 5278 110 662
312267 2 Mar 2017 6188 419 1156
72497 23 Feb 2017 2103 57 284
566168 16 Feb 2017 14809 336 1377
158059 2 Feb 2017 3683 53 546
241847 26 Jan 2017 6259 185 997
313571 19 Jan 2017 7921 140 980
153137 12 Jan 2017 4617 59 606
274378 5 Jan 2017 8077 404 1304
143995 22 Dec 2016 3565 75 403
169567 15 Dec 2016 4033 70 968
212652 8 Dec 2016 5400 145 1134
111195 1 Dec 2016 3529 46 548
318748 24 Nov 2016 8585 352 1826
872483 17 Nov 2016 23675 745 2391
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment