Section 5.4 — Bayesian difference between means#
This notebook contains the code examples from Section 5.4 Bayesian difference between means from the No Bullshit Guide to Statistics.
See also:
compare_iqs2_many_ways.ipynb
Examples: treszkai/best
Notebook setup#
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import bambi as bmb
import arviz as az
# Figures setup
plt.clf() # needed otherwise `sns.set_theme` doesn"t work
from plot_helpers import RCPARAMS
RCPARAMS.update({"figure.figsize": (5, 3)}) # good for screen
# RCPARAMS.update({"figure.figsize": (5, 1.6)}) # good for print
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc=RCPARAMS,
)
# High-resolution please
%config InlineBackend.figure_format = "retina"
<Figure size 640x480 with 0 Axes>
# set random seed for repeatability
np.random.seed(42)
#######################################################
Introduciton#
We’ll now revisit the problem of comparison between two groups from a Bayesian perspective.
Model#
The data models for the two groups are based on Student’s \(t\)-distribution:
We’ll use a normal prior for the means, half-\(t\) priors for standard deviations, and a gamma distribution for the degrees of freedom parameter:
Bambi formula objects#
import bambi as bmb
formula = bmb.Formula("y ~ 0 + group",
"sigma ~ 0 + group")
formula
Formula('y ~ 0 + group', 'sigma ~ 0 + group')
formula.main, formula.additionals
('y ~ 0 + group', ('sigma ~ 0 + group',))
formula.get_all_formulas()
['y ~ 0 + group', 'sigma ~ 0 + group']
Choosing priors for log-sigma#
Normal distribution for log-sigma#
from scipy.stats import norm
def get_lognormal(mu=0, sigma=1):
logsigs = np.linspace(-5, 5, 1000)
dlogsigs = norm(mu, sigma).pdf(logsigs)
sigmas = np.exp(logsigs)
# Apply the change of variables to get the density for sigma
# based on the Jacobian |d(log(sigma))/d(sigma)| = 1/sigma
dsigmas = dlogsigs / sigmas
return sigmas, dsigmas
with plt.rc_context({"figure.figsize":(6,2.2)}):
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True)
# PLOT 1
mu, sigma = 0, 1
sigmas, dsigmas = get_lognormal(mu, sigma)
sns.lineplot(x=sigmas, y=dsigmas, ax=ax1)
ax1.set_xlabel('$\\sigma$')
ax1.set_ylabel('density')
ax1.set_title(f'(a) LogNorm({mu},{sigma}) = exp($\\mathcal{{N}}$({mu},{sigma}))')
ax1.set_xlim(0, 10)
ax1.set_ylim(0, 0.7)
# PLOT 2
mu, sigma = 1, 2
sigmas, dsigmas = get_lognormal(mu, sigma)
sns.lineplot(x=sigmas, y=dsigmas, ax=ax2)
ax2.set_xlabel('$\\sigma$')
ax2.set_ylabel('density')
ax2.set_title(f'(b) LogNorm({mu},{sigma}) = exp($\\mathcal{{N}}$({mu},{sigma}))')
ax2.set_xlim(0, 10)
Choosing priors for the degrees of freedom parameter#
# FIGURES ONLY
from scipy.stats import gamma
from scipy.stats import expon
from ministats import plot_pdf
with plt.rc_context({"figure.figsize":(6,2.2)}):
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True)
# PLOT 1
alpha, beta = 2, 0.1
rv_nu = gamma(a=2, scale=1/beta)
plot_pdf(rv_nu, rv_name="ν", ax=ax1)
ax1.set_title(f'(a) Gamma($\\alpha=${alpha}, $\\beta=${beta})')
ax1.set_xlim(0, 100)
# PLOT 2
scale = 30
rv_nu_exp = expon(scale=scale)
plot_pdf(rv_nu_exp, rv_name="ν", ax=ax2)
ax2.set_title(f'(b) Expon($\\lambda=$1/{scale})')
ax2.set_xlim(0, 100)
Example 1: comparing electricity prices#
Electricity prices from East End and West End
Electricity prices dataset#
eprices = pd.read_csv("../datasets/eprices.csv")
eprices.groupby("loc")["price"].describe()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
loc | ||||||||
East | 9.0 | 6.155556 | 0.877655 | 4.8 | 5.5 | 6.3 | 6.5 | 7.7 |
West | 9.0 | 9.155556 | 1.562139 | 6.8 | 8.3 | 8.6 | 10.0 | 11.8 |
eprices["price"].mean(), eprices["price"].std()
(7.655555555555556, 1.973120020267162)
Bayesian model#
Bambi model#
formula1 = bmb.Formula("price ~ 0 + loc",
"sigma ~ 0 + loc")
links1 = {"mu":"identity",
"sigma":"log"}
priors1 = {
"loc": bmb.Prior("Normal", mu=8, sigma=5),
"sigma": {
"loc": bmb.Prior("Normal", mu=0, sigma=1)
},
"nu": bmb.Prior("Gamma", alpha=2, beta=0.1),
}
mod1 = bmb.Model(formula=formula1,
family="t",
link=links1,
priors=priors1,
data=eprices)
mod1
Formula: price ~ 0 + loc
sigma ~ 0 + loc
Family: t
Link: mu = identity
sigma = log
Observations: 18
Priors:
target = mu
Common-level effects
loc ~ Normal(mu: 8.0, sigma: 5.0)
Auxiliary parameters
nu ~ Gamma(alpha: 2.0, beta: 0.1)
target = sigma
Common-level effects
sigma_loc ~ Normal(mu: 0.0, sigma: 1.0)
mod1.build()
mod1.graph()
Prior predictive checks#
# TODO
Model fitting and analysis#
idata1 = mod1.fit(random_seed=[42,43,44,45])
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/pymc/sampling/mcmc.py:764: UserWarning: A list or tuple of random_seed no longer specifies the specific random_seed of each chain. Use a single seed instead.
warnings.warn(
Initializing NUTS using jitter+adapt_diag...
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/pytensor/link/c/cmodule.py:2959: UserWarning: PyTensor could not link to a BLAS installation. Operations that might benefit from BLAS will be severely degraded.
This usually happens when PyTensor is installed via pip. We recommend it be installed via conda/mamba/pixi instead.
Alternatively, you can use an experimental backend such as Numba or JAX that perform their own BLAS optimizations, by setting `pytensor.config.mode == 'NUMBA'` or passing `mode='NUMBA'` when compiling a PyTensor function.
For more options and details see https://pytensor.readthedocs.io/en/latest/troubleshooting.html#how-do-i-configure-test-my-blas-library
warnings.warn(
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, loc, sigma_loc]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 2 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
Processing the results#
Calculate derived quantities used for the analysis plots and summaries.
post1 = idata1["posterior"]
# Calculate sigmas from log-sigmas
#######################################################
logsig_W = post1["sigma_loc"].sel(sigma_loc_dim="West")
logsig_E = post1["sigma_loc"].sel(sigma_loc_dim="East")
post1["sigma_West"] = np.exp(logsig_W)
post1["sigma_East"] = np.exp(logsig_E)
# Calculate the difference between between means
post1["mu_West"] = post1["loc"].sel(loc_dim="West")
post1["mu_East"] = post1["loc"].sel(loc_dim="East")
post1["dmeans"] = post1["mu_West"] - post1["mu_East"]
# Calculate the difference between standard deviations
post1["dstd"] = post1["sigma_West"]-post1["sigma_East"]
# Effect size
#######################################################
pvar =(post1["sigma_West"]**2+post1["sigma_East"]**2)/2
post1["cohend"] = post1["dmeans"] / np.sqrt(pvar)
# #ALT: use helper function
# from ministats import calc_dmeans_stats
# calc_dmeans_stats(idata1, group_name="loc")
az.summary(idata1, kind="stats", hdi_prob=0.90,
var_names=["mu_West", "mu_East", "dmeans", "sigma_West", "sigma_East", "dstd", "nu", "cohend"])
mean | sd | hdi_5% | hdi_95% | |
---|---|---|---|---|
mu_West | 9.123 | 0.587 | 8.233 | 10.093 |
mu_East | 6.150 | 0.302 | 5.695 | 6.687 |
dmeans | 2.973 | 0.650 | 1.940 | 4.064 |
sigma_West | 1.589 | 0.492 | 0.847 | 2.261 |
sigma_East | 0.904 | 0.260 | 0.513 | 1.267 |
dstd | 0.685 | 0.564 | -0.177 | 1.576 |
nu | 20.687 | 13.481 | 2.018 | 39.852 |
cohend | 2.379 | 0.686 | 1.250 | 3.436 |
Compare to frequentist results#
from scipy.stats import ttest_ind
pricesW = eprices[eprices["loc"]=="West"]["price"]
pricesE = eprices[eprices["loc"]=="East"]["price"]
res1 = ttest_ind(pricesW, pricesE, equal_var=False)
res1.statistic, res1.pvalue
(5.022875513276464, 0.0002570338337217609)
res1.confidence_interval(confidence_level=0.9)
ConfidenceInterval(low=1.9396575883681466, high=4.060342411631854)
from ministats import cohend2
cohend2(pricesW, pricesE)
2.3678062243290983
Conclusions#
Example 2: comparing IQ scores#
We’ll look at IQ scores data taken from a the paper Bayesian Estimation Supersedes the t-Test (BEST) by John K. Kruschke.
smart drug administered to treatment
group and want to compare to control
group. Data contains outliers)
cf. compare_iqs2_many_ways.ipynb
IQ scores dataset#
iqs2 = pd.read_csv("../datasets/iqs2.csv")
iqs2.groupby("group")["iq"].describe()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
group | ||||||||
ctrl | 42.0 | 100.357143 | 2.516496 | 88.0 | 100.0 | 100.5 | 101.0 | 105.0 |
treat | 47.0 | 101.914894 | 6.021085 | 82.0 | 100.0 | 102.0 | 103.0 | 124.0 |
Bayesian model#
TODO: add formulas
Bambi model#
formula2 = bmb.Formula("iq ~ 0 + group",
"sigma ~ 0 + group")
priors2 = {
"group": bmb.Prior("Normal", mu=100, sigma=35),
"sigma": {
"group": bmb.Prior("Normal", mu=1, sigma=2)
},
"nu": bmb.Prior("Gamma", alpha=2, beta=0.1),
}
mod2 = bmb.Model(formula=formula2,
family="t",
# link={"mu":"identity", "sigma":"log"}, # Bambi defaults
priors=priors2,
data=iqs2)
mod2
Formula: iq ~ 0 + group
sigma ~ 0 + group
Family: t
Link: mu = identity
sigma = log
Observations: 89
Priors:
target = mu
Common-level effects
group ~ Normal(mu: 100.0, sigma: 35.0)
Auxiliary parameters
nu ~ Gamma(alpha: 2.0, beta: 0.1)
target = sigma
Common-level effects
sigma_group ~ Normal(mu: 1.0, sigma: 2.0)
mod2.build()
mod2.graph()
Model fitting and analysis#
idata2 = mod2.fit(random_seed=[42,43,44,45])
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/pymc/sampling/mcmc.py:764: UserWarning: A list or tuple of random_seed no longer specifies the specific random_seed of each chain. Use a single seed instead.
warnings.warn(
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, group, sigma_group]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 2 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
from ministats import calc_dmeans_stats
calc_dmeans_stats(idata2, group_name="group");
az.summary(idata2, kind="stats", hdi_prob=0.95,
var_names=["mu_treat", "mu_ctrl", "dmeans", "sigma_treat", "sigma_ctrl", "dstd", "nu", "cohend"])
mean | sd | hdi_2.5% | hdi_97.5% | |
---|---|---|---|---|
mu_treat | 101.539 | 0.384 | 100.773 | 102.286 |
mu_ctrl | 100.523 | 0.200 | 100.130 | 100.919 |
dmeans | 1.016 | 0.432 | 0.122 | 1.840 |
sigma_treat | 2.008 | 0.430 | 1.242 | 2.908 |
sigma_ctrl | 1.030 | 0.204 | 0.658 | 1.416 |
dstd | 0.977 | 0.442 | 0.079 | 1.813 |
nu | 1.862 | 0.488 | 1.051 | 2.857 |
cohend | 0.657 | 0.303 | 0.074 | 1.280 |
Compare to frequentist results#
from scipy.stats import ttest_ind
treated = iqs2[iqs2["group"]=="treat"]["iq"].values
controls = iqs2[iqs2["group"]=="ctrl"]["iq"].values
res2 = ttest_ind(treated, controls, equal_var=False)
res2.statistic, res2.pvalue
(1.622190457290228, 0.10975381983712831)
res2.confidence_interval(confidence_level=0.95)
ConfidenceInterval(low=-0.36118477236636837, high=3.4766862921232016)
from ministats import cohend2
cohend2(treated, controls)
0.3309654530139732
# MAYBE
# # Test if the variances of the two groups are the same
# from scipy.stats import levene
# levene(treated, controls)
Conclusions#
Explanations#
Alternative choices of priors#
e.g. prior for sigma as very wiiiiiide, and nu ~ Exp(1/29)+1 → BEST Bayesian estimation supersedes the t-test by John K. Kruschke
If we use a model with common standard deviation, we get equivalent of pooled sigma
Sensitivity analysis and robustness checks#
# TODO
Performance tests#
Simulated datasets#
Simulate data grid consisting of ∆ * outliers * n * m
Model comparisons#
Compare results several models:
N model
T model (robust)
BayesFactor based on JZS prior
Classical two-sample t-test (from Section 4.5)
Discussion#
Comparison to the frequentist two-sample t-test#
results numerically similar
note we’re using 𝒯 as data model, not as a sampling distribution
conceptually different:
p-value vs. decision based on posterior distribution
confidence intervals vs. credible intervals
Comparing multiple groups#
?Extension to Bayesian ANOVA? Can extend approach to multiple groups: Bayesian ANOVA
FWD reference to hierarchical models for group comparison covered in Section 5.5
Exercises#
Exercise 1: small samples#
As = [5.77, 5.33, 4.59, 4.33, 3.66, 4.48]
Bs = [3.88, 3.55, 3.29, 2.59, 2.33, 3.59]
groups = ["A"]*len(As) + ["B"]*len(Bs)
df1 = pd.DataFrame({"group": groups, "vals": As + Bs})
# df1
Exercise 2: lecture and debate curriculums#
students = pd.read_csv("../datasets/students.csv")
# students.groupby("curriculum")["score"].describe()
formula_std = bmb.Formula("score ~ 0 + curriculum",
"sigma ~ 0 + curriculum")
priors_std = {
"curriculum": bmb.Prior("Normal", mu=70, sigma=30),
"sigma": {
"curriculum":bmb.Prior("Normal", mu=1, sigma=2)
},
"nu": bmb.Prior("Gamma", alpha=2, beta=0.1),
}
mod_std = bmb.Model(formula=formula_std,
family="t",
link="identity",
priors=priors_std,
data=students)
mod_std
# mod_std.build()
# mod_std.graph()
Formula: score ~ 0 + curriculum
sigma ~ 0 + curriculum
Family: t
Link: mu = identity
sigma = log
Observations: 15
Priors:
target = mu
Common-level effects
curriculum ~ Normal(mu: 70.0, sigma: 30.0)
Auxiliary parameters
nu ~ Gamma(alpha: 2.0, beta: 0.1)
target = sigma
Common-level effects
sigma_curriculum ~ Normal(mu: 1.0, sigma: 2.0)
idata_std = mod_std.fit()
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, curriculum, sigma_curriculum]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 2 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
# Calculate the difference between between means
currs = idata_std["posterior"]["curriculum"]
mu_debate = currs.sel(curriculum_dim="debate")
mu_lecture = currs.sel(curriculum_dim="lecture")
idata_std["posterior"]["dmeans"] = mu_debate - mu_lecture
az.summary(idata_std, kind="stats", var_names=["dmeans"], hdi_prob=0.95)
mean | sd | hdi_2.5% | hdi_97.5% | |
---|---|---|---|---|
dmeans | 7.399 | 5.258 | -2.861 | 17.366 |
# from ministats import plot_dmeans_stats
# plot_dmeans_stats(mod_std, idata_std, ppc_xlims=[50,100], group_name="curriculum");
# from scipy.stats import ttest_ind
# scoresD = students[students["curriculum"]=="debate"]["score"]
# scoresL = students[students["curriculum"]=="lecture"]["score"]
# res_std = ttest_ind(scoresD, scoresL, equal_var=False)
# res_std.statistic, res_std.pvalue
# from ministats import cohend2
# cohend2(scoresL, scoresD)
Exercise 3: redo exercises from Section 3.5 section using Bayesian methods#
# TODO
Links#
BONUS Examples#
Example 4: small example form BEST vignette#
See http://cran.nexr.com/web/packages/BEST/vignettes/BEST.pdf#page=2
y1s = [5.77, 5.33, 4.59, 4.33, 3.66, 4.48]
y2s = [3.88, 3.55, 3.29, 2.59, 2.33, 3.59]
from ministats.bayes import bayes_dmeans
mod4, idata4 = bayes_dmeans(y1s, y2s, groups=["y1", "y2"])
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, Intercept, group, sigma_group]
Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 3 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
from ministats import calc_dmeans_stats
calc_dmeans_stats(idata4)
az.summary(idata4, kind="stats", hdi_prob=0.95,
var_names=["dmeans", "sigma_y1", "sigma_y2", "dstd", "nu", "cohend"])
mean | sd | hdi_2.5% | hdi_97.5% | |
---|---|---|---|---|
dmeans | -1.436 | 0.477 | -2.433 | -0.549 |
sigma_y1 | 0.852 | 0.340 | 0.368 | 1.530 |
sigma_y2 | 0.719 | 0.268 | 0.327 | 1.282 |
dstd | -0.133 | 0.428 | -0.960 | 0.748 |
nu | 21.391 | 14.443 | 1.349 | 49.059 |
cohend | -1.911 | 0.767 | -3.497 | -0.556 |
# from ministats import plot_dmeans_stats
# plot_dmeans_stats(mod4, idata4, ppc_xlims=None);
Example 5: comparing morning to evening#
morning = [8.99, 9.21, 9.03, 9.15, 8.68, 8.82, 8.66, 8.82, 8.59, 8.14,
9.09, 8.80, 8.18, 9.23, 8.55, 9.03, 9.36, 9.06, 9.57, 8.38]
evening = [9.82, 9.34, 9.73, 9.93, 9.33, 9.41, 9.48, 9.14, 8.62, 8.60,
9.60, 9.41, 8.43, 9.77, 8.96, 9.81, 9.75, 9.50, 9.90, 9.13]
from ministats.bayes import bayes_dmeans
mod5, idata5 = bayes_dmeans(evening, morning, groups=["evening", "morning"])
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, Intercept, group, sigma_group]
Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 3 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
from ministats import calc_dmeans_stats
calc_dmeans_stats(idata5)
az.summary(idata5, kind="stats", hdi_prob=0.95,
var_names=["dmeans", "sigma_evening", "sigma_morning", "dstd", "nu", "cohend"])
mean | sd | hdi_2.5% | hdi_97.5% | |
---|---|---|---|---|
dmeans | -0.528 | 0.141 | -0.808 | -0.256 |
sigma_evening | 0.455 | 0.086 | 0.307 | 0.620 |
sigma_morning | 0.389 | 0.074 | 0.249 | 0.536 |
dstd | -0.066 | 0.112 | -0.279 | 0.157 |
nu | 23.069 | 14.566 | 3.229 | 51.788 |
cohend | -1.262 | 0.377 | -2.015 | -0.535 |
# from ministats import plot_dmeans_stats
# plot_dmeans_stats(mod5, idata5, ppc_xlims=None);