Chapter 3 Problems#

Notebook setup#

# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Useful colors
snspal = sns.color_palette()
blue, orange, purple = snspal[0], snspal[1], snspal[4]
# red = sns.color_palette("tab10")[3]

# High-resolution please
%config InlineBackend.figure_format = 'retina'
# set random seed for repeatability
np.random.seed(42)

\(\def\stderr#1{\mathbf{se}_{#1}}\) \(\def\stderrhat#1{\hat{\mathbf{se}}_{#1}}\) \(\newcommand{\Mean}{\textbf{Mean}}\) \(\newcommand{\Var}{\textbf{Var}}\) \(\newcommand{\Std}{\textbf{Std}}\) \(\newcommand{\Freq}{\textbf{Freq}}\) \(\newcommand{\RelFreq}{\textbf{RelFreq}}\) \(\newcommand{\DMeans}{\textbf{DMeans}}\) \(\newcommand{\Prop}{\textbf{Prop}}\) \(\newcommand{\DProps}{\textbf{DProps}}\)

\[ \newcommand{\CI}[1]{\textbf{CI}_{#1}} \newcommand{\CIL}[1]{\textbf{L}_{#1}} \newcommand{\CIU}[1]{\textbf{U}_{#1}} \newcommand{\ci}[1]{\textbf{ci}_{#1}} \newcommand{\cil}[1]{\textbf{l}_{#1}} \newcommand{\ciu}[1]{\textbf{u}_{#1}} \]

(this cell contains the macro definitions like \(\stderr{\overline{\mathbf{x}}}\), \(\stderrhat{}\), \(\Mean\), …)

Problem NN: alt t-test for the mean of Batch 04 (Example 1BT)#

muK0 = 1000   # population mean (expected kombucha volume)
kombucha = pd.read_csv("../datasets/kombucha.csv")
ksample04 = kombucha[kombucha["batch"]==4]["volume"]
n04 = len(ksample04)
obsmean04 = np.mean(ksample04)
# bootstrap estimate for standard error of the mean
from ministats import gen_boot_dist

np.random.seed(42)
kbars_boot04 = gen_boot_dist(ksample04, estfunc=np.mean)
sehat_boot04 = np.std(kbars_boot04)
sehat_boot04
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
1.225161704465105
# compute the t statistic using bootstrap se
obst04bt = (obsmean04 - muK0) / sehat_boot04
obst04bt
3.1289747190340322
from scipy.stats import t as tdist
from ministats import tailprobs
rvT04 = tdist(n04 - 1)
pvalue04bt = tailprobs(rvT04, obst04bt, alt="two-sided")
pvalue04bt
0.0033143496482337274

The \(p\)-value is very small, so our decision is to reject \(H_0\).

Problem NN: alt t-test for the mean of Batch 01 (Example 2BT)#

muK0 = 1000   # population mean (expected kombucha volume)
kombucha = pd.read_csv("../datasets/kombucha.csv")
ksample01 = kombucha[kombucha["batch"]==1]["volume"]
n01 = len(ksample01)
obsmean01 = np.mean(ksample01)
# bootstrap estimate for standard error of the mean
from ministats import gen_boot_dist
np.random.seed(42)
kbars_boot01 = gen_boot_dist(ksample01, estfunc=np.mean)
sehat_boot01 = np.std(kbars_boot01)
sehat_boot01
1.530831183342292
# compute the t statistic using bootstrap se
obst01bt = (obsmean01 - muK0) / sehat_boot01
obst01bt
-0.5854662550335628
from scipy.stats import t as tdist
from ministats import tailprobs
rvT01 = tdist(n01-1)
pvalue01bt = tailprobs(rvT01, obst01bt, alt="two-sided")
pvalue01bt
0.5616069624592428

The \(p\)-value is very large, so we have no reason to reject \(H_0\).

Problem NN: simulation test of mean#

Let’s write a reusable function for performing the simulation test for the mean.

#######################################################

def simulation_test_mean(sample, mu0, sigma0, N=10000):
    """
    Compute the p-value of the observed mean of `sample`
    under H0 of a normal distribution `norm(mu0,sigma0)`.
    """
    # Compute the observed value of the mean
    obsmean = mean(sample)

    # Obtain the sampling dist. of the mean under H0
    n = len(sample)
    rvXH0 = norm(mu0, sigma0)
    xbars = gen_sampling_dist(rvXH0, estfunc=mean, n=n)

    # Compute the p-value
    # tails = tailvalues(xbars, obsmean)
    dev = abs(obsmean - mu0)
    tails = [v for v in xbars if abs(v-muK0) >= dev]
    pvalue = len(tails) / len(xbars)
    return pvalue
## TEST 1 (Do we get the same answer for Batch 04?)
np.random.seed(42)
simulation_test_mean(ksample04, mu0=muK0, sigma0=sigmaK0)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 3
      1 ## TEST 1 (Do we get the same answer for Batch 04?)
      2 np.random.seed(42)
----> 3 simulation_test_mean(ksample04, mu0=muK0, sigma0=sigmaK0)

NameError: name 'sigmaK0' is not defined
## TEST 2 (Do we get the same answer for Batch 01?)
np.random.seed(42)
simulation_test_mean(ksample01, mu0=muK0, sigma0=sigmaK0)
0.5711

We have confirmed the function simulation_test_mean works as expected. We’ve added it to ministats module so we can reuse it later.

Problem NN: simulation test of var#

#######################################################

def simulation_test_var(sample, mu0, sigma0, alt="greater"):
    """
    Compute the p-value of the observed variance of `sample`
    under H0 of a normal distribution `norm(mu0,sigma0)`.
    """
    # 1. Compute the sample variance
    obsvar = var(sample)
    n = len(sample)

    # 2. Get sampling distribution of variance under H0
    rvXH0 = norm(mu0, sigma0)
    xvars = gen_sampling_dist(rvXH0, estfunc=var, n=n)

    # 3. Compute the p-value
    tails = [xvar for xvar in xvars if xvar >= obsvar]
    pvalue = len(tails) / len(xvars)
    return pvalue
# reproduce the results from Example 3
np.random.seed(42)
simulation_test_var(ksample02, muK0, sigmaK0, alt="greater")
0.2132
# reproduce the results from Example 4
np.random.seed(43)
simulation_test_var(ksample08, muK0, sigmaK0, alt="greater")
0.0041

We confirm the function simulation_test_var gives the same \(p\)-values as the two samples we calculated manually above, so we can be confident it is working as expected.