Section 2.3 — Inventory of discrete distributions#
This notebook contains all the code examples from Section 2.3 Inventory of discrete distributions of the No Bullshit Guide to Statistics.
Notebook setup#
# load Python modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Figures setup
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc={'figure.figsize': (6,2)},
)
%config InlineBackend.figure_format = 'retina'
# simple float __repr__
np.set_printoptions(legacy='1.25')
/tmp/ipykernel_7872/1655438542.py:2: UserWarning: legacy printing option can currently only be '1.13', '1.21', or `False`
np.set_printoptions(legacy='1.25')
# set random seed for repeatability
np.random.seed(42)
%pip install -q ministats
Note: you may need to restart the kernel to use updated packages.
from ministats import plot_pmf
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[6], line 1
----> 1 from ministats import plot_pmf
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/ministats/__init__.py:8
4 __email__ = 'ivan@minireference.com'
5 __version__ = '0.3.22'
----> 8 from .bayes import (
9 mode_from_samples,
10 hdi_from_grid,
11 hdi_from_rv,
12 hdi_from_samples,
13 calc_dmeans_stats,
14 plot_dmeans_stats,
15 )
17 from .confidence_intervals import (
18 ci_mean,
19 ci_var,
20 ci_dmeans,
21 )
23 from .estimators import (
24 mean,
25 var,
(...)
29 quantile,
30 )
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/ministats/bayes.py:5
2 import logging
4 import arviz as az
----> 5 import bambi as bmb
6 import matplotlib.pyplot as plt
7 import numpy as np
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/bambi/__init__.py:5
1 import logging
3 from importlib.metadata import version
----> 5 from pymc import math
7 from .backend import inference_methods, PyMCModel
8 from .config import config
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pymc/__init__.py:48
43 augmented = f"{augmented} -fno-unwind-tables -fno-asynchronous-unwind-tables"
45 pytensor.config.gcc__cxxflags = augmented
---> 48 __set_compiler_flags()
50 from pymc import _version, gp, ode, sampling
51 from pymc.backends import *
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pymc/__init__.py:31, in __set_compiler_flags()
29 def __set_compiler_flags():
30 # Workarounds for PyTensor compiler problems on various platforms
---> 31 import pytensor
33 current = pytensor.config.gcc__cxxflags
34 augmented = f"{current} -Wno-c++11-narrowing"
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/__init__.py:120
116 return as_tensor_variable(x, **kwargs)
119 # isort: off
--> 120 from pytensor import scalar, tensor
121 from pytensor.compile import (
122 In,
123 Mode,
(...)
129 shared,
130 )
131 from pytensor.compile.function import function, function_dump
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/__init__.py:103
99 return len(var.data)
102 import pytensor.tensor.exceptions
--> 103 import pytensor.tensor.rewriting
104 from pytensor.gradient import grad, hessian, jacobian
106 # adds shared-variable constructors
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/rewriting/__init__.py:11
9 import pytensor.tensor.rewriting.jax
10 import pytensor.tensor.rewriting.linalg
---> 11 import pytensor.tensor.rewriting.math
12 import pytensor.tensor.rewriting.numba
13 import pytensor.tensor.rewriting.ofg
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/rewriting/math.py:2695
2692 get_clients_at_depth2 = partial(get_clients_at_depth, depth=2)
2694 # 1+erf(x)=>erfc(-x)
-> 2695 local_one_plus_erf = PatternNodeRewriter(
2696 (add, 1, (erf, "x")),
2697 (erfc, (neg, "x")),
2698 allow_multiple_clients=True,
2699 name="local_one_plus_erf",
2700 tracks=[erf],
2701 get_nodes=get_clients_at_depth1,
2702 )
2703 register_canonicalize(local_one_plus_erf)
2704 register_stabilize(local_one_plus_erf)
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1597, in PatternNodeRewriter.__init__(self, in_pattern, out_pattern, allow_multiple_clients, skip_identities_fn, name, tracks, get_nodes, values_eq_approx)
1593 else:
1594 raise TypeError(
1595 "The pattern to search for must start with a specific Op instance."
1596 )
-> 1597 self.__doc__ = f"{self.__class__.__doc__}\n\nThis instance does: {self}\n"
1598 self.allow_multiple_clients = allow_multiple_clients
1599 self.skip_identities_fn = skip_identities_fn
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1688, in PatternNodeRewriter.__str__(self)
1684 else:
1685 return str(pattern)
1687 return (
-> 1688 f"{pattern_to_str(self.in_pattern)} -> {pattern_to_str(self.out_pattern)}"
1689 )
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1685, in PatternNodeRewriter.__str__.<locals>.pattern_to_str(pattern)
1683 return f"{a} subject to {b}"
1684 else:
-> 1685 return str(pattern)
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/etuples/core.py:288, in ExpressionTuple.__str__(self)
287 def __str__(self):
--> 288 return f"e({', '.join(tuple(str(i) for i in self._tuple))})"
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/etuples/core.py:288, in <genexpr>(.0)
287 def __str__(self):
--> 288 return f"e({', '.join(tuple(str(i) for i in self._tuple))})"
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/basic.py:803, in Constant.__str__(self)
802 def __str__(self):
--> 803 data_str = str(self.data).replace("\n", "")
804 if len(data_str) > 20:
805 data_str = data_str[:10].strip() + " ... " + data_str[-10:].strip()
File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/numpy/core/arrayprint.py:1599, in _array_str_implementation(a, max_line_width, precision, suppress_small, array2string)
1595 def _array_str_implementation(
1596 a, max_line_width=None, precision=None, suppress_small=None,
1597 array2string=array2string):
1598 """Internal version of array_str() that allows overriding array2string."""
-> 1599 if (_format_options['legacy'] <= 113 and
1600 a.shape == () and not a.dtype.names):
1601 return str(a.item())
1603 # the str of 0d arrays is a special case: It should appear like a scalar,
1604 # so floats are not truncated by `precision`, and strings are not wrapped
1605 # in quotes. So we return the str of the scalar value.
TypeError: '<=' not supported between instances of 'str' and 'int'
Discrete uniform distribution#
from scipy.stats import randint
alpha = 1
beta = 4
rvR = randint(alpha, beta+1)
Note the +1
we added to the second argument.
rvR.mean()
2.5
rvR.var()
1.25
The limits of the sample space of the random variable rvR
can be obtained by calling its .support()
method.
Probability mass function#
rvR.support()
(1, 4)
Bernoulli distribution#
from scipy.stats import bernoulli
rvB = bernoulli(p=0.3)
rvB.support()
(0, 1)
rvB.mean(), rvB.var()
(0.3, 0.21)
rvB.rvs(10)
array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1])
Mathematical interlude: counting combinations#
Factorials#
TODO explain + math formula
from scipy.special import factorial
factorial(4)
24.0
# # ALT.
# from math import factorial
# factorial(4)
The factorial of \(0\) is defined as 1: \(0!=1\).
factorial(0)
1.0
The factorial function grows grows very quickly:
[factorial(n) for n in [5, 6, 7, 8, 9, 10, 11, 12, 13]]
[120.0,
720.0,
5040.0,
40320.0,
362880.0,
3628800.0,
39916800.0,
479001600.0,
6227020800.0]
Combinations#
TODO explain + math formula
from scipy.special import comb
comb(5, 2)
10.0
from itertools import combinations
list(combinations({1,2,3,4,5}, 2))
[(1, 2),
(1, 3),
(1, 4),
(1, 5),
(2, 3),
(2, 4),
(2, 5),
(3, 4),
(3, 5),
(4, 5)]
Exercises#
from scipy.special import comb
comb(12,6)
924.0
from scipy.special import comb
comb(52,5)
2598960.0
Links#
Binomial distribution#
We’ll use the name rvX
because rvB
was already used for the Bernoulli random variable above.
from scipy.stats import binom
n = 20
p = 0.14
rvX = binom(n,p)
rvX.support()
(0, 20)
rvX.mean(), rvX.var()
(2.8000000000000003, 2.4080000000000004)
Poisson distribution#
from scipy.stats import poisson
lam = 10
rvP = poisson(lam)
rvP.pmf(8)
0.11259903214902009
rvP.cdf(8)
0.3328196787507191
## ALT. way to compute the value F_P(8) =
# sum([rvP.pmf(x) for x in range(0,8+1)])
Geometric distribution#
from scipy.stats import geom
rvG = geom(p = 0.2)
rvG.support()
(1, inf)
rvG.mean(), rvG.var()
(5.0, 20.0)
Negative binomial distribution#
from scipy.stats import nbinom
r = 10
p = 0.5
rvN = nbinom(r,p)
rvN.support()
(0, inf)
rvN.mean(), rvN.var()
(10.0, 20.0)
Computer models for discrete distributions#
TODO table models
TODO table methods
Building computer models for random variables#
# import the model family
from scipy.stats import randint
# choose parameters
alpha = 1 # start at 1
beta = 4 # stop at 4
# create the rv object
rvR = randint(alpha, beta+1)
Note the +1
we added to the second argument.
rvR.mean()
2.5
# verify using formula
(alpha + beta) / 2
2.5
rvR.var()
1.25
# verify using formula
((beta + 1 - alpha)**2 - 1) / 12
1.25
rvR.std()
1.118033988749895
The limits of the sample space of the random variable rvR
can be obtained by calling its .support()
method.
rvR.support()
(1, 4)
Probability calculations#
rvR.pmf(1) + rvR.pmf(2) + rvR.pmf(3)
0.75
[rvR.pmf(r) for r in range(1,3+1)]
[0.25, 0.25, 0.25]
sum([rvR.pmf(r) for r in range(1,3+1)])
0.75
rvR.cdf(3)
0.75
Plotting distributions#
Probability mass function#
To create a stem-plot of the probability mass function \(f_R\), we can use the following three-step procedure:
Create a range of inputs
rs
for the plot.Compute the value of \(f_R =\)
rvR
for each of the inputs and store the results as list of valuesfRs
.Plot the values
fRs
by calling the functionplt.stem(rs, fRs)
.
import matplotlib.pyplot as plt
rs = range(0, 8+1)
fRs = rvR.pmf(rs)
fRs = np.where(fRs == 0, np.nan, fRs) # set zero fRs to np.nan
plt.stem(rs, fRs, basefmt=" ");
Note, we used the np.where
statement
to replace zero entries in the list fRs
to the special Not a Number (np.nan
) values,
which excludes them from the plot.
This is arguably a cleaner representation of the PMF plot:
the function is only defined for the sample space \(\mathcal{R} = \{1,2,3,4\}\).
An alternative ,,,
Cumulative distribution function#
import numpy as np
import seaborn as sns
bs = np.linspace(0, 8, 1000)
FRs = rvR.cdf(bs)
sns.lineplot(x=bs, y=FRs);
Generating random observations#
Let’s generate 10 random observations from random variable rvR
:
np.random.seed(43)
rvR.rvs(10)
array([1, 1, 4, 2, 2, 3, 1, 4, 2, 4])
Explanations#
Binomial coefficient formula explained#
n = 6
[int(comb(n,k)) for k in range(0,n+1)]
[1, 6, 15, 20, 15, 6, 1]
We can draw Pascal’s triangle
by by repeating the above calculation for several values of n
.
for n in range(0,10):
row_ints = [int(comb(n,k)) for k in range(0,n+1)]
row_str = str(row_ints)
row_str = row_str.replace("[","").replace("]","").replace(",", " ")
row_str = row_str.center(60)
print(row_str)
1
1 1
1 2 1
1 3 3 1
1 4 6 4 1
1 5 10 10 5 1
1 6 15 20 15 6 1
1 7 21 35 35 21 7 1
1 8 28 56 70 56 28 8 1
1 9 36 84 126 126 84 36 9 1
Geometric sums#
r = 0.3
sum([r**n for n in range(0,100)])
1.4285714285714286
1 / (1 - r)
1.4285714285714286
Discussion#
Relations between distributions#
Exercises#
Exercise XX#
from scipy.stats import poisson
lam = 20
rvH = poisson(lam)
# a) Pr(H in {20,21,22})
rvH.pmf(20) + rvH.pmf(21) + rvH.pmf(22)
0.2503540762867841
# b) Pr({16≤H≤24})
rvH.cdf(24) - rvH.cdf(16-1)
0.6867142435340193
# c) F_H^{-1}(0.95)
rvH.ppf(0.95)
28.0