Section 2.3 — Inventory of discrete distributions#
This notebook contains all the code examples from Section 2.3 Inventory of discrete distributions of the No Bullshit Guide to Statistics.
Notebook setup#
# load Python modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Figures setup
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc={'figure.figsize': (7,5)},
)
%config InlineBackend.figure_format = 'retina'
# set random seed for repeatability
np.random.seed(42)
# Download the `plot_helpers.py` module from the book's main github repo:
import os, requests
if not os.path.exists("plot_helpers.py"):
resp = requests.get("https://raw.githubusercontent.com/minireference/noBSstat/main/notebooks/plot_helpers.py")
with open("plot_helpers.py", "w") as f:
f.write(resp.text)
print("Downloaded `plot_helpers.py` module to current directory:", os.getcwd())
else:
print("You already have plot_helpers.py, so we can proceed.")
from plot_helpers import plot_pmf
from plot_helpers import plot_cdf
You already have plot_helpers.py, so we can proceed.
Definitions#
Math prerequisites#
Combinatorics#
See SciPy docs:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.factorial.html
https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.perm.html
https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.comb.html
Factorial#
from scipy.special import factorial
# ALT.
# from math import factorial
factorial(4)
24.0
factorial(1), factorial(2), factorial(3)
(1.0, 2.0, 6.0)
[factorial(k) for k in [5,6,7,8,9,10,11,12]]
[120.0, 720.0, 5040.0, 40320.0, 362880.0, 3628800.0, 39916800.0, 479001600.0]
factorial(15)
1307674368000.0
import numpy as np
np.log(factorial(15))/np.log(10)
12.116499611123398
Permutations#
from scipy.special import perm
perm(5,2)
20.0
perm(5,1), perm(5,2), perm(5,3), perm(5,4), perm(5,5)
(5.0, 20.0, 60.0, 120.0, 120.0)
If you want to actually see, all the possible permutations we
use the permutations function from itertools
from itertools import permutations
n = 5
nitems = range(1,n+1)
k = 2
list(permutations(nitems, k))
[(1, 2),
(1, 3),
(1, 4),
(1, 5),
(2, 1),
(2, 3),
(2, 4),
(2, 5),
(3, 1),
(3, 2),
(3, 4),
(3, 5),
(4, 1),
(4, 2),
(4, 3),
(4, 5),
(5, 1),
(5, 2),
(5, 3),
(5, 4)]
Combinations#
from scipy.special import comb
comb(5,2)
10.0
from itertools import combinations
n = 5
k = 2
list(combinations(range(1,n+1), k))
[(1, 2),
(1, 3),
(1, 4),
(1, 5),
(2, 3),
(2, 4),
(2, 5),
(3, 4),
(3, 5),
(4, 5)]
Summations using basic Python#
N = 10
nums = range(1,N+1)
list(nums)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
sum(nums)
55
N*(N+1)/2 # formula
55.0
squarednums = [num**2 for num in nums]
squarednums
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
sum(squarednums)
385
N*(N+1)*(2*N+1)/6 # formula
385.0
cubednums = [num**3 for num in nums]
cubednums
[1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]
sum(cubednums)
3025
( N*(N+1)/2 )**2 # formula
3025.0
Sum of the geometric series#
a = 0.5
r = 0.5
sum([a*r**k for k in range(0,60)])
1.0
Summations using SymPy (bonus material)#
from sympy import symbols
from sympy import summation
from sympy import simplify
k, N, r = symbols("k N r")
Sum of arithmetic sequence#
a_k = k
summation(a_k, (k,0, N))
simplify(summation(a_k, (k,0, N)))
Sum of squares#
b_k = k**2
simplify(summation(b_k, (k,0, N)))
Sum of cubes#
c_k = k**3
simplify(summation(c_k, (k,0, N)))
Geometric series#
g_k = r**k
summation(g_k, (k,0, N))
# from sympy import limit, oo
# limit( (1-r**(N+1))/(1-r), N, oo)
# # doesn't work; need to specify assumption r < 1
Discrete distributions reference#
Discrete uniform#
# import the model family
from scipy.stats import randint
# choose parameters
alpha = 1 # start at
beta = 4 # stop at
# create the rv object
rvU = randint(alpha, beta+1)
# use one of the rv object's methods
The limits of the sample space of the random variable rvU
can be obtained by calling its .support()
method.
rvU.support()
(1, 4)
rvU.mean()
2.5
rvU.var()
1.25
rvU.std() # = np.sqrt(rvU.var())
1.118033988749895
Probability mass function#
for x in range(1,4+1):
print("f_U(",x,") = ", rvU.pmf(x))
f_U( 1 ) = 0.25
f_U( 2 ) = 0.25
f_U( 3 ) = 0.25
f_U( 4 ) = 0.25
To create a stem-plot of the probability mass function \(f_U\), we can use the following three-step procedure:
Create a range of inputs
xs
for the plot.Compute the value of \(f_U =\)
rvU
for each of the inputs and store the results as list of valuesfUs
.Plot the values
fUs
by calling the functionplt.stem(xs,fUs)
.
import numpy as np
import matplotlib.pyplot as plt
xs = np.arange(0,8+1)
fUs = rvU.pmf(xs)
plt.stem(xs, fUs, basefmt=" ")
<StemContainer object of 3 artists>

# ALT
plot_pmf(rvU, xlims=[0,8+1])
<Axes: xlabel='x', ylabel='$f_{X}$'>

Cumulative distribution function#
for b in range(1,4+1):
print("F_U(", b, ") = ", rvU.cdf(b))
F_U( 1 ) = 0.25
F_U( 2 ) = 0.5
F_U( 3 ) = 0.75
F_U( 4 ) = 1.0
import numpy as np
import seaborn as sns
xs = np.linspace(0,8,1000)
FUs = rvU.cdf(xs)
sns.lineplot(x=xs, y=FUs)
<Axes: >

# ALT
plot_cdf(rvU, xlims=[0,8], rv_name="U")
<Axes: xlabel='u', ylabel='$F_{U}$'>

Let’s generate 10 random observations from random variable rvU
:
rvU.rvs(10)
array([3, 4, 1, 3, 3, 4, 1, 1, 3, 2])
Bernoulli#
from scipy.stats import bernoulli
rvB = bernoulli(p=0.3)
rvB.support()
(0, 1)
rvB.mean(), rvB.var()
(0.3, 0.21)
rvB.rvs(10)
array([0, 0, 1, 0, 1, 0, 1, 1, 0, 0])
_ = plot_pmf(rvB, xlims=[0,5])

Poisson#
from scipy.stats import poisson
lam = 10
rvP = poisson(lam)
rvP.pmf(8)
0.11259903214902009
rvP.cdf(8)
0.3328196787507191
## ALT. way to compute the value F_P(8) =
# sum([rvP.pmf(x) for x in range(0,8+1)])
_ = plot_pmf(rvP, xlims=[0,30])

Binomial#
We’ll use the name rvX
because rvB
was already used for the Bernoulli random variable above.
from scipy.stats import binom
n = 20
p = 0.14
rvX = binom(n,p)
rvX.support()
(0, 20)
rvX.mean(), rvX.var()
(2.8000000000000003, 2.4080000000000004)
_ = plot_pmf(rvX, xlims=[0,30])

Geometric#
from scipy.stats import geom
rvG = geom(p = 0.2)
rvG.support()
(1, inf)
rvG.mean(), rvG.var()
(5.0, 20.0)
_ = plot_pmf(rvG, xlims=[0,40])

Negative binomial#
from scipy.stats import nbinom
r = 10
p = 0.5
rvN = nbinom(r,p)
rvN.support()
(0, inf)
rvN.mean(), rvN.var()
(10.0, 20.0)
_ = plot_pmf(rvN, xlims=[0,40])

Hypergeometric#
from scipy.stats import hypergeom
a = 30 # number of success balls
b = 40 # number of failure balls
n = 20 # how many we're drawing
rvH = hypergeom(a+b, a, n)
rvH.support()
(0, 20)
rvH.mean(), rvH.var()
(8.571428571428571, 3.54924578527063)
meanH, stdH = rvH.stats()
print("mean =", meanH, " std =", stdH)
mean = 8.571428571428571 std = 3.54924578527063
_ = plot_pmf(rvH, xlims=[0,30])

Tomatoes salad probabilities#
a = 3 # number of good tomatoes
b = 4 # number of rotten tomatoes
n = 2 # how many we're drawing
rvHe = hypergeom(a+b, a, n)
_ = plot_pmf(rvHe, xlims=[0,3])
rvHe.pmf(0), rvHe.pmf(1), rvHe.pmf(2)
(0.28571428571428575, 0.5714285714285715, 0.14285714285714288)

Number of dogs seen by Amy#
a = 7 # number dogs
b = 20 - 7 # number of other animals
n = 12 # how many "patients" Amy will see today
rvD = hypergeom(a+b, a, n)
# Pr of exactly five dogs
rvD.pmf(5)
0.2860681114551084
_ = plot_pmf(rvD, xlims=[0,10])

Multinomial#
See docs.
from scipy.stats import multinomial
n = 10
ps = [0.1, 0.5, 0.8]
rvM = multinomial(n,ps)
rvM.rvs()
array([[0, 6, 4]])
# TODO: 3D scatter plot of points in space
Modelling real-world data using probability#
TODO: add simple inference and plots