Chapter 3 Problems#
Notebook setup#
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Useful colors
snspal = sns.color_palette()
blue, orange, purple = snspal[0], snspal[1], snspal[4]
# red = sns.color_palette("tab10")[3]
# High-resolution please
%config InlineBackend.figure_format = 'retina'
# set random seed for repeatability
np.random.seed(42)
\(\def\stderr#1{\mathbf{se}_{#1}}\) \(\def\stderrhat#1{\hat{\mathbf{se}}_{#1}}\) \(\newcommand{\Mean}{\textbf{Mean}}\) \(\newcommand{\Var}{\textbf{Var}}\) \(\newcommand{\Std}{\textbf{Std}}\) \(\newcommand{\Freq}{\textbf{Freq}}\) \(\newcommand{\RelFreq}{\textbf{RelFreq}}\) \(\newcommand{\DMeans}{\textbf{DMeans}}\) \(\newcommand{\Prop}{\textbf{Prop}}\) \(\newcommand{\DProps}{\textbf{DProps}}\)
(this cell contains the macro definitions like \(\stderr{\overline{\mathbf{x}}}\), \(\stderrhat{}\), \(\Mean\), …)
Problem NN: alt t-test for the mean of Batch 04 (Example 1BT)#
muK0 = 1000 # population mean (expected kombucha volume)
kombucha = pd.read_csv("../datasets/kombucha.csv")
ksample04 = kombucha[kombucha["batch"]==4]["volume"]
n04 = len(ksample04)
obsmean04 = np.mean(ksample04)
# bootstrap estimate for standard error of the mean
from ministats import gen_boot_dist
np.random.seed(42)
kbars_boot04 = gen_boot_dist(ksample04, estfunc=np.mean)
sehat_boot04 = np.std(kbars_boot04)
sehat_boot04
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
1.225161704465105
# compute the t statistic using bootstrap se
obst04bt = (obsmean04 - muK0) / sehat_boot04
obst04bt
3.1289747190340322
from scipy.stats import t as tdist
from ministats import tailprobs
rvT04 = tdist(n04 - 1)
pvalue04bt = tailprobs(rvT04, obst04bt, alt="two-sided")
pvalue04bt
0.0033143496482337274
The \(p\)-value is very small, so our decision is to reject \(H_0\).
Problem NN: alt t-test for the mean of Batch 01 (Example 2BT)#
muK0 = 1000 # population mean (expected kombucha volume)
kombucha = pd.read_csv("../datasets/kombucha.csv")
ksample01 = kombucha[kombucha["batch"]==1]["volume"]
n01 = len(ksample01)
obsmean01 = np.mean(ksample01)
# bootstrap estimate for standard error of the mean
from ministats import gen_boot_dist
np.random.seed(42)
kbars_boot01 = gen_boot_dist(ksample01, estfunc=np.mean)
sehat_boot01 = np.std(kbars_boot01)
sehat_boot01
1.530831183342292
# compute the t statistic using bootstrap se
obst01bt = (obsmean01 - muK0) / sehat_boot01
obst01bt
-0.5854662550335628
from scipy.stats import t as tdist
from ministats import tailprobs
rvT01 = tdist(n01-1)
pvalue01bt = tailprobs(rvT01, obst01bt, alt="two-sided")
pvalue01bt
0.5616069624592428
The \(p\)-value is very large, so we have no reason to reject \(H_0\).
Problem NN: simulation test of mean#
Let’s write a reusable function for performing the simulation test for the mean.
#######################################################
def simulation_test_mean(sample, mu0, sigma0, N=10000):
"""
Compute the p-value of the observed mean of `sample`
under H0 of a normal distribution `norm(mu0,sigma0)`.
"""
# Compute the observed value of the mean
obsmean = mean(sample)
# Obtain the sampling dist. of the mean under H0
n = len(sample)
rvXH0 = norm(mu0, sigma0)
xbars = gen_sampling_dist(rvXH0, estfunc=mean, n=n)
# Compute the p-value
# tails = tailvalues(xbars, obsmean)
dev = abs(obsmean - mu0)
tails = [v for v in xbars if abs(v-muK0) >= dev]
pvalue = len(tails) / len(xbars)
return pvalue
## TEST 1 (Do we get the same answer for Batch 04?)
np.random.seed(42)
simulation_test_mean(ksample04, mu0=muK0, sigma0=sigmaK0)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 3
1 ## TEST 1 (Do we get the same answer for Batch 04?)
2 np.random.seed(42)
----> 3 simulation_test_mean(ksample04, mu0=muK0, sigma0=sigmaK0)
NameError: name 'sigmaK0' is not defined
## TEST 2 (Do we get the same answer for Batch 01?)
np.random.seed(42)
simulation_test_mean(ksample01, mu0=muK0, sigma0=sigmaK0)
0.5711
We have confirmed the function simulation_test_mean
works as expected.
We’ve added it to ministats
module so we can reuse it later.
Problem NN: simulation test of var#
#######################################################
def simulation_test_var(sample, mu0, sigma0, alt="greater"):
"""
Compute the p-value of the observed variance of `sample`
under H0 of a normal distribution `norm(mu0,sigma0)`.
"""
# 1. Compute the sample variance
obsvar = var(sample)
n = len(sample)
# 2. Get sampling distribution of variance under H0
rvXH0 = norm(mu0, sigma0)
xvars = gen_sampling_dist(rvXH0, estfunc=var, n=n)
# 3. Compute the p-value
tails = [xvar for xvar in xvars if xvar >= obsvar]
pvalue = len(tails) / len(xvars)
return pvalue
# reproduce the results from Example 3
np.random.seed(42)
simulation_test_var(ksample02, muK0, sigmaK0, alt="greater")
0.2132
# reproduce the results from Example 4
np.random.seed(43)
simulation_test_var(ksample08, muK0, sigmaK0, alt="greater")
0.0041
We confirm the function simulation_test_var
gives the same \(p\)-values as the two samples
we calculated manually above,
so we can be confident it is working as expected.