Section 2.3 — Inventory of discrete distributions#

This notebook contains all the code examples from Section 2.3 Inventory of discrete distributions of the No Bullshit Guide to Statistics.

Notebook setup#

# load Python modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Figures setup
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc={'figure.figsize': (6,2)},
)

%config InlineBackend.figure_format = 'retina'
# simple float __repr__
np.set_printoptions(legacy='1.25')
/tmp/ipykernel_7872/1655438542.py:2: UserWarning: legacy printing option can currently only be '1.13', '1.21', or `False`
  np.set_printoptions(legacy='1.25')
# set random seed for repeatability
np.random.seed(42)
%pip install -q ministats
Note: you may need to restart the kernel to use updated packages.
from ministats import plot_pmf
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 from ministats import plot_pmf

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/ministats/__init__.py:8
      4 __email__ = 'ivan@minireference.com'
      5 __version__ = '0.3.22'
----> 8 from .bayes import (
      9     mode_from_samples,
     10     hdi_from_grid,
     11     hdi_from_rv,
     12     hdi_from_samples,
     13     calc_dmeans_stats,
     14     plot_dmeans_stats,
     15 )
     17 from .confidence_intervals import (
     18     ci_mean,
     19     ci_var,
     20     ci_dmeans,
     21 )
     23 from .estimators import (
     24     mean,
     25     var,
   (...)
     29     quantile,
     30 )

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/ministats/bayes.py:5
      2 import logging
      4 import arviz as az
----> 5 import bambi as bmb
      6 import matplotlib.pyplot as plt
      7 import numpy as np

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/bambi/__init__.py:5
      1 import logging
      3 from importlib.metadata import version
----> 5 from pymc import math
      7 from .backend import inference_methods, PyMCModel
      8 from .config import config

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pymc/__init__.py:48
     43     augmented = f"{augmented} -fno-unwind-tables -fno-asynchronous-unwind-tables"
     45     pytensor.config.gcc__cxxflags = augmented
---> 48 __set_compiler_flags()
     50 from pymc import _version, gp, ode, sampling
     51 from pymc.backends import *

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pymc/__init__.py:31, in __set_compiler_flags()
     29 def __set_compiler_flags():
     30     # Workarounds for PyTensor compiler problems on various platforms
---> 31     import pytensor
     33     current = pytensor.config.gcc__cxxflags
     34     augmented = f"{current} -Wno-c++11-narrowing"

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/__init__.py:120
    116     return as_tensor_variable(x, **kwargs)
    119 # isort: off
--> 120 from pytensor import scalar, tensor
    121 from pytensor.compile import (
    122     In,
    123     Mode,
   (...)
    129     shared,
    130 )
    131 from pytensor.compile.function import function, function_dump

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/__init__.py:103
     99     return len(var.data)
    102 import pytensor.tensor.exceptions
--> 103 import pytensor.tensor.rewriting
    104 from pytensor.gradient import grad, hessian, jacobian
    106 # adds shared-variable constructors

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/rewriting/__init__.py:11
      9 import pytensor.tensor.rewriting.jax
     10 import pytensor.tensor.rewriting.linalg
---> 11 import pytensor.tensor.rewriting.math
     12 import pytensor.tensor.rewriting.numba
     13 import pytensor.tensor.rewriting.ofg

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/tensor/rewriting/math.py:2695
   2692 get_clients_at_depth2 = partial(get_clients_at_depth, depth=2)
   2694 # 1+erf(x)=>erfc(-x)
-> 2695 local_one_plus_erf = PatternNodeRewriter(
   2696     (add, 1, (erf, "x")),
   2697     (erfc, (neg, "x")),
   2698     allow_multiple_clients=True,
   2699     name="local_one_plus_erf",
   2700     tracks=[erf],
   2701     get_nodes=get_clients_at_depth1,
   2702 )
   2703 register_canonicalize(local_one_plus_erf)
   2704 register_stabilize(local_one_plus_erf)

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1597, in PatternNodeRewriter.__init__(self, in_pattern, out_pattern, allow_multiple_clients, skip_identities_fn, name, tracks, get_nodes, values_eq_approx)
   1593 else:
   1594     raise TypeError(
   1595         "The pattern to search for must start with a specific Op instance."
   1596     )
-> 1597 self.__doc__ = f"{self.__class__.__doc__}\n\nThis instance does: {self}\n"
   1598 self.allow_multiple_clients = allow_multiple_clients
   1599 self.skip_identities_fn = skip_identities_fn

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1688, in PatternNodeRewriter.__str__(self)
   1684     else:
   1685         return str(pattern)
   1687 return (
-> 1688     f"{pattern_to_str(self.in_pattern)} -> {pattern_to_str(self.out_pattern)}"
   1689 )

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/rewriting/basic.py:1685, in PatternNodeRewriter.__str__.<locals>.pattern_to_str(pattern)
   1683     return f"{a} subject to {b}"
   1684 else:
-> 1685     return str(pattern)

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/etuples/core.py:288, in ExpressionTuple.__str__(self)
    287 def __str__(self):
--> 288     return f"e({', '.join(tuple(str(i) for i in self._tuple))})"

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/etuples/core.py:288, in <genexpr>(.0)
    287 def __str__(self):
--> 288     return f"e({', '.join(tuple(str(i) for i in self._tuple))})"

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/pytensor/graph/basic.py:803, in Constant.__str__(self)
    802 def __str__(self):
--> 803     data_str = str(self.data).replace("\n", "")
    804     if len(data_str) > 20:
    805         data_str = data_str[:10].strip() + " ... " + data_str[-10:].strip()

File /opt/hostedtoolcache/Python/3.10.18/x64/lib/python3.10/site-packages/numpy/core/arrayprint.py:1599, in _array_str_implementation(a, max_line_width, precision, suppress_small, array2string)
   1595 def _array_str_implementation(
   1596         a, max_line_width=None, precision=None, suppress_small=None,
   1597         array2string=array2string):
   1598     """Internal version of array_str() that allows overriding array2string."""
-> 1599     if (_format_options['legacy'] <= 113 and
   1600             a.shape == () and not a.dtype.names):
   1601         return str(a.item())
   1603     # the str of 0d arrays is a special case: It should appear like a scalar,
   1604     # so floats are not truncated by `precision`, and strings are not wrapped
   1605     # in quotes. So we return the str of the scalar value.

TypeError: '<=' not supported between instances of 'str' and 'int'

Discrete uniform distribution#

from scipy.stats import randint

alpha = 1
beta = 4
rvR = randint(alpha, beta+1)

Note the +1 we added to the second argument.

rvR.mean()
2.5
rvR.var()
1.25

The limits of the sample space of the random variable rvR can be obtained by calling its .support() method.

Probability mass function#

plot_pmf(rvR, xlims=[0,8+1]);
rvR.support()
(1, 4)

Bernoulli distribution#

from scipy.stats import bernoulli

rvB = bernoulli(p=0.3)
rvB.support()
(0, 1)
rvB.mean(), rvB.var()
(0.3, 0.21)
rvB.rvs(10)
array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1])
plot_pmf(rvB, xlims=[0,5]);

Mathematical interlude: counting combinations#

Factorials#

TODO explain + math formula

from scipy.special import factorial

factorial(4)
24.0
# # ALT.
# from math import factorial
# factorial(4)

The factorial of \(0\) is defined as 1: \(0!=1\).

factorial(0)
1.0

The factorial function grows grows very quickly:

[factorial(n) for n in [5, 6, 7, 8, 9, 10, 11, 12, 13]]
[120.0,
 720.0,
 5040.0,
 40320.0,
 362880.0,
 3628800.0,
 39916800.0,
 479001600.0,
 6227020800.0]

Combinations#

TODO explain + math formula

from scipy.special import comb

comb(5, 2)
10.0
from itertools import combinations

list(combinations({1,2,3,4,5}, 2))
[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5)]

Exercises#

from scipy.special import comb

comb(12,6)
924.0
from scipy.special import comb

comb(52,5)
2598960.0

Binomial distribution#

We’ll use the name rvX because rvB was already used for the Bernoulli random variable above.

from scipy.stats import binom

n = 20
p = 0.14
rvX = binom(n,p)
rvX.support()
(0, 20)
rvX.mean(), rvX.var()
(2.8000000000000003, 2.4080000000000004)
plot_pmf(rvX, xlims=[0,21]);

Poisson distribution#

from scipy.stats import poisson
lam = 10
rvP = poisson(lam)
rvP.pmf(8)
0.11259903214902009
rvP.cdf(8)
0.3328196787507191
## ALT. way to compute the value F_P(8) =
# sum([rvP.pmf(x) for x in range(0,8+1)])
plot_pmf(rvP, xlims=[0,30]);

Geometric distribution#

from scipy.stats import geom

rvG = geom(p = 0.2)
rvG.support()
(1, inf)
rvG.mean(), rvG.var()
(5.0, 20.0)
plot_pmf(rvG, xlims=[0,30]);

Negative binomial distribution#

from scipy.stats import nbinom

r = 10
p = 0.5
rvN = nbinom(r,p)
rvN.support()
(0, inf)
rvN.mean(), rvN.var()
(10.0, 20.0)
plot_pmf(rvN, xlims=[0,30]);

Computer models for discrete distributions#

TODO table models

TODO table methods

Building computer models for random variables#

# import the model family
from scipy.stats import randint

# choose parameters
alpha = 1  # start at 1
beta = 4   # stop at 4

# create the rv object
rvR = randint(alpha, beta+1)

Note the +1 we added to the second argument.

rvR.mean()
2.5
# verify using formula
(alpha + beta) / 2
2.5
rvR.var()
1.25
# verify using formula
((beta + 1 - alpha)**2 - 1) / 12
1.25
rvR.std()
1.118033988749895

The limits of the sample space of the random variable rvR can be obtained by calling its .support() method.

rvR.support()
(1, 4)

Probability calculations#

rvR.pmf(1) + rvR.pmf(2) + rvR.pmf(3)
0.75
[rvR.pmf(r) for r in range(1,3+1)]
[0.25, 0.25, 0.25]
sum([rvR.pmf(r) for r in range(1,3+1)])
0.75
rvR.cdf(3)
0.75

Plotting distributions#

Probability mass function#

To create a stem-plot of the probability mass function \(f_R\), we can use the following three-step procedure:

  1. Create a range of inputs rs for the plot.

  2. Compute the value of \(f_R =\) rvR for each of the inputs and store the results as list of values fRs.

  3. Plot the values fRs by calling the function plt.stem(rs, fRs).

import matplotlib.pyplot as plt

rs = range(0, 8+1)
fRs = rvR.pmf(rs)
fRs = np.where(fRs == 0, np.nan, fRs)  # set zero fRs to np.nan
plt.stem(rs, fRs, basefmt=" ");

Note, we used the np.where statement to replace zero entries in the list fRs to the special Not a Number (np.nan) values, which excludes them from the plot. This is arguably a cleaner representation of the PMF plot: the function is only defined for the sample space \(\mathcal{R} = \{1,2,3,4\}\).

An alternative ,,,

# ALT. using helper function
from ministats import plot_pmf

plot_pmf(rvR, xlims=[0,8+1], rv_name="R");

Cumulative distribution function#

import numpy as np
import seaborn as sns

bs = np.linspace(0, 8, 1000)
FRs = rvR.cdf(bs)
sns.lineplot(x=bs, y=FRs);
# ALT. using helper function
from ministats import plot_cdf

plot_cdf(rvR, xlims=[0,8], rv_name="R");

Generating random observations#

Let’s generate 10 random observations from random variable rvR:

np.random.seed(43)

rvR.rvs(10)
array([1, 1, 4, 2, 2, 3, 1, 4, 2, 4])

Explanations#

Binomial coefficient formula explained#

n = 6

[int(comb(n,k)) for k in range(0,n+1)]
[1, 6, 15, 20, 15, 6, 1]

We can draw Pascal’s triangle by by repeating the above calculation for several values of n.

for n in range(0,10):
    row_ints = [int(comb(n,k)) for k in range(0,n+1)]
    row_str = str(row_ints)
    row_str = row_str.replace("[","").replace("]","").replace(",", "  ")
    row_str = row_str.center(60)
    print(row_str)
                             1                              
                           1   1                            
                         1   2   1                          
                       1   3   3   1                        
                     1   4   6   4   1                      
                  1   5   10   10   5   1                   
                1   6   15   20   15   6   1                
             1   7   21   35   35   21   7   1              
           1   8   28   56   70   56   28   8   1           
       1   9   36   84   126   126   84   36   9   1        

Geometric sums#

r = 0.3

sum([r**n for n in range(0,100)])
1.4285714285714286
1 / (1 - r)
1.4285714285714286

Discussion#

Relations between distributions#

Exercises#

Exercise XX#

from scipy.stats import poisson
lam = 20
rvH = poisson(lam)
# a) Pr(H in {20,21,22}) 
rvH.pmf(20) + rvH.pmf(21) + rvH.pmf(22)
0.2503540762867841
# b) Pr({16≤H≤24})
rvH.cdf(24) - rvH.cdf(16-1)
0.6867142435340193
# c) F_H^{-1}(0.95)
rvH.ppf(0.95)
28.0

Links#