Section 2.2 — Multiple random variables#
This notebook contains all the code examples from Section 2.2 Multiple random variables in the No Bullshit Guide to Statistics.
Notebook setup#
# Ensure required Python modules are installed
%pip install --quiet numpy scipy seaborn
[notice] A new release of pip is available: 26.0.1 -> 26.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
# load Python modules
import matplotlib.pyplot as plt # generic plotting functions
import numpy as np # numerical calculations
import pandas as pd # data frames to store joint PMFs
import seaborn as sns # plotting distributions
# Figures setup
plt.clf() # needed otherwise `sns.set_theme` doesn't work
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc={"font.family": "serif",
"font.serif": ["Palatino", "DejaVu Serif", "serif"],
"figure.figsize": (5,3)},
)
%config InlineBackend.figure_format = 'retina'
<Figure size 640x480 with 0 Axes>
# Simple float __repr__
import numpy as np
if int(np.__version__.split(".")[0]) >= 2:
np.set_printoptions(legacy='1.25')
# set random seed for repeatability
np.random.seed(42)
Definitions#
jpdfXY = pd.DataFrame(data=[[ 0.1, 0.05, 0.02, 0.01, 0.01],
[0.06, 0.12, 0.06, 0.03, 0.03],
[0.02, 0.06, 0.12, 0.07, 0.02],
[0.01, 0.01, 0.05, 0.1, 0.05]],
index=["a", "b", "c", "d"],
columns=[1,2,3,4,5])
jpdfXY.columns.name = "X"
jpdfXY.index.name = "Y"
print(jpdfXY.sum().sum())
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfXY);
0.9999999999999999
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[5], line 11
7 jpdfXY.columns.name = "X"
8 jpdfXY.index.name = "Y"
9
10 print(jpdfXY.sum().sum())
---> 11 from ministats.plots.probability import plot_joint_pdf_dots
12 plot_joint_pdf_dots(jpdfXY);
ImportError: cannot import name 'plot_joint_pdf_dots' from 'ministats.plots.probability' (/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/ministats/plots/probability.py)
print(sum(jpdfXY.loc["b"]))
print(jpdfXY.loc["b"].values)
print(jpdfXY.loc["b"].values / sum(jpdfXY.loc["b"]))
0.3
[0.06 0.12 0.06 0.03 0.03]
[0.2 0.4 0.2 0.1 0.1]
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfXY.T, zmax=0.2)
ax.view_init(elev=30, azim=-50)
ax.text2D(1.19, 0.5, "$f_{XY}$", transform=ax.transAxes,
fontsize=plt.rcParams["axes.labelsize"],
rotation=90, va="center", ha="center");
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfXY, sigfigs=3)
\[\begin{split}
\begin{array}{c|ccccc}
f_{XY} & X=1 & X=2 & X=3 & X=4 & X=5 \\
\hline
Y=a & 0.1 & 0.05 & 0.02 & 0.01 & 0.01 \\
Y=b & 0.06 & 0.12 & 0.06 & 0.03 & 0.03 \\
Y=c & 0.02 & 0.06 & 0.12 & 0.07 & 0.02 \\
Y=d & 0.01 & 0.01 & 0.05 & 0.1 & 0.05 \\
\end{array}
\end{split}\]
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfXY, highlight=[[(1,"b"),(5,"b")]]);
Example 1: two coin tosses#
def fC(c):
if c in ["heads", "tails"]:
return 1/2
else:
return 0
def fC1C2(c1,c2):
return fC(c1) * fC(c2)
for c1 in ["heads", "tails"]:
for c2 in ["heads", "tails"]:
print("Pr(" + c1 + "," + c2 + ") =", fC1C2(c1,c2))
Pr(heads,heads) = 0.25
Pr(heads,tails) = 0.25
Pr(tails,heads) = 0.25
Pr(tails,tails) = 0.25
import pandas as pd
# C1=heads C1=tails
pC1C2 = [[0.25, 0.25 ], # C2=heads
[0.25, 0.25 ]] # C2=tails
jpdfC1C2 = pd.DataFrame(data=pC1C2,
columns=["heads", "tails"],
index=["heads", "tails"])
jpdfC1C2.columns.name = "C_1"
jpdfC1C2.index.name = "C_2"
Example 2: rolling two dice#
def fD(d):
if d in [1,2,3,4,5,6]:
return 1/6
else:
return 0
def fD1D2(d1,d2):
return fD(d1) * fD(d2)
# D1=1 D1=2 D1=3 D1=4 D1=5 D1=6
pD1D2 = [[1/36, 1/36, 1/36, 1/36, 1/36, 1/36], # D2=1
[1/36, 1/36, 1/36, 1/36, 1/36, 1/36], # D2=2
[1/36, 1/36, 1/36, 1/36, 1/36, 1/36], # D2=3
[1/36, 1/36, 1/36, 1/36, 1/36, 1/36], # D2=4
[1/36, 1/36, 1/36, 1/36, 1/36, 1/36], # D2=5
[1/36, 1/36, 1/36, 1/36, 1/36, 1/36]] # D2=6
jpdfD1D2 = pd.DataFrame(data=pD1D2,
columns=[1,2,3,4,5,6],
index=[1,2,3,4,5,6])
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfD1D2, zmax=0.045);
ax.text2D(1.15, 0.56, "$f_{D_1D_2}$", transform=ax.transAxes,
fontsize=plt.rcParams["axes.labelsize"],
rotation=90, va="center", ha="center");
from ministats.plots.probability import plot_joint_pdf_dots
ax = plot_joint_pdf_dots(jpdfD1D2)
ax.set_xlabel("$d_1$")
ax.set_ylabel("$d_2$");
# CUT
# # PLOT D_1 + D_2 = 7
# xs, ys, fXYs = [], [], []
# for x in range(1,6+1):
# y = 7 - x
# xs.append(x)
# ys.append(y)
# fXYxy = fD1D2(x,y)
# fXYs.append(fXYxy)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# ax.stem(xs, ys, fXYs, basefmt=" ")
# ax.set_xlabel("$d_1$")
# ax.set_ylabel("$d_2$")
# ax.set_zlabel("$f_{D_1D_2}$")
# ax.set_zlim([0, 0.05]);
Example 3: coin-dependent dice roll#
import pandas as pd
# D=1 D=3 D=3 D=4 D=5 D=6
pDC = [[1/6, 1/6, 1/6, 1/6, 1/6, 1/6], # C=heads
[1/4, 1/4, 1/4, 1/4, 0, 0]] # C=tails
jpdfDC = pd.DataFrame(data=pDC,
index=["heads","tails"],
columns=[1,2,3,4,5,6])
jpdfDC.columns.name = "D"
jpdfDC.index.name = "C"
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfDC.T)
ax.text2D(1.15, 0.56, "$f_{DC}$", transform=ax.transAxes,
fontsize=plt.rcParams["axes.labelsize"],
rotation=90, va="center", ha="center");
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfDC, sigfigs=3)
\[\begin{split}
\begin{array}{c|cccccc}
f_{DC} & D=1 & D=2 & D=3 & D=4 & D=5 & D=6 \\
\hline
C=\texttt{heads} & 0.167 & 0.167 & 0.167 & 0.167 & 0.167 & 0.167 \\
C=\texttt{tails} & 0.25 & 0.25 & 0.25 & 0.25 & 0 & 0 \\
\end{array}
\end{split}\]
def fC(c):
if c in ["heads", "tails"]:
return 1/2
else:
return 0
def fD6(d):
if d in [1,2,3,4,5,6]:
return 1/6
else:
return 0
def fD4(d):
if d in [1,2,3,4]:
return 1/4
else:
return 0
def fDC(d,c):
if c == "heads":
return fC(c) * fD6(d)
elif c == "tails":
return fC(c) * fD4(d)
for c in ["heads", "tails"]:
for d in [1,2,3,4,5,6]:
print("Pr(" + str(d) + "," + c + ") =", round(fDC(d,c),4))
Pr(1,heads) = 0.0833
Pr(2,heads) = 0.0833
Pr(3,heads) = 0.0833
Pr(4,heads) = 0.0833
Pr(5,heads) = 0.0833
Pr(6,heads) = 0.0833
Pr(1,tails) = 0.125
Pr(2,tails) = 0.125
Pr(3,tails) = 0.125
Pr(4,tails) = 0.125
Pr(5,tails) = 0.0
Pr(6,tails) = 0.0
sum(fDC(d,c) for d in [1,2,3,4,5,6] for c in ["heads", "tails"])
1.0
# f_{Y|X}(y|heads) [as a list]
[fDC(d,"heads") for d in range(1,6+1)]
[0.08333333333333333,
0.08333333333333333,
0.08333333333333333,
0.08333333333333333,
0.08333333333333333,
0.08333333333333333]
# f_{Y|X}(y|tails) [as a list]
[fDC(d,"tails") for d in range(1,6+1)]
[0.125, 0.125, 0.125, 0.125, 0.0, 0.0]
# f_Y(y) [as a list]
fDs = [fDC(d,"heads")+fDC(d,"tails") for d in range(1,6+1)]
fDs
[0.20833333333333331,
0.20833333333333331,
0.20833333333333331,
0.20833333333333331,
0.08333333333333333,
0.08333333333333333]
# ALT. compute f_{D|C}(d|heads)*0.5 + f_{D|C}(d|tails)*0.5
fDs = [fD6(d)*0.5+fD4(d)*0.5 for d in range(1,6+1)]
fDs
[0.20833333333333331,
0.20833333333333331,
0.20833333333333331,
0.20833333333333331,
0.08333333333333333,
0.08333333333333333]
sum(fDs)
0.9999999999999999
Example 4: diagnostic test#
def fV(v):
if v == 1:
return 0.03
elif v == 0:
return 0.97
def fTgivenV1(t):
if t == 1:
return 0.90
elif t == 0:
return 0.10
def fTgivenV0(t):
if t == 1:
return 0.20
elif t == 0:
return 0.80
def fTgivenV(t,v):
if v == 1:
return fTgivenV1(t)
elif v == 0:
return fTgivenV0(t)
def fTV(t,v):
return fTgivenV(t,v) * fV(v)
for v in range(0,1+1):
for t in range(0,1+1):
print("Pr({T=" + str(t) + ",V=" + str(v)+"})",
"=", fTgivenV(t,v), "*", fV(v),
"=", round(fTV(t,v),4))
Pr({T=0,V=0}) = 0.8 * 0.97 = 0.776
Pr({T=1,V=0}) = 0.2 * 0.97 = 0.194
Pr({T=0,V=1}) = 0.1 * 0.03 = 0.003
Pr({T=1,V=1}) = 0.9 * 0.03 = 0.027
import pandas as pd
# T = 0 T = 1
pTV = [[ 0.8*0.97, (1-0.8)*0.97], # V = 0
[ (1-0.9)*0.03, 0.9*0.03]] # V = 1
jpdfTV = pd.DataFrame(data=pTV, columns=[0,1], index=[0,1])
jpdfTV.columns.name = "T"
jpdfTV.index.name = "V"
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfTV.T)
ax.text2D(1.15, 0.56, "$f_{TV}$", transform=ax.transAxes,
fontsize=plt.rcParams["axes.labelsize"],
rotation=90, va="center", ha="center")
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfTV, sigfigs=3)
\[\begin{split}
\begin{array}{c|cc}
f_{TV} & T=0 & T=1 \\
\hline
V=0 & 0.776 & 0.194 \\
V=1 & 0.003 & 0.027 \\
\end{array}
\end{split}\]
joint_pdf_to_array(jpdfTV, sigfigs=3, margins=True)
\[\begin{split}
\begin{array}{c|cc|c}
f_{TV} & T=0 & T=1 & f_{V} \\
\hline
V=0 & 0.776 & 0.194 & 0.97 \\
V=1 & 0.003 & 0.027 & 0.03 \\
\hline
f_{T} & 0.779 & 0.221 & \\
\end{array}
\end{split}\]
joint_pdf_to_array(jpdfTV, sigfigs=3, margins=False, output="dataframe")
| T | 0 | 1 |
|---|---|---|
| V | ||
| 0 | 0.776 | 0.194 |
| 1 | 0.003 | 0.027 |
latex = joint_pdf_to_array(jpdfTV, sigfigs=3, margins=True, output="latex")
print(latex)
$$
\begin{array}{c|cc|c}
f_{TV} & T=0 & T=1 & f_{V} \\
\hline
V=0 & 0.776 & 0.194 & 0.97 \\
V=1 & 0.003 & 0.027 & 0.03 \\
\hline
f_{T} & 0.779 & 0.221 & \\
\end{array}
$$
# f_T(0) =
0.8 * 0.97 + 0.10 * 0.03
0.779
# f_T(1) =
0.2 * 0.97 + 0.90 * 0.03
0.221
for t in range(0,1+1):
fTt = fTV(t,0) + fTV(t,1)
print("Pr({T="+str(t)+"}) =", fTt)
Pr({T=0}) = 0.779
Pr({T=1}) = 0.221
Example 4: diagnostic test (continued)#
We’re now interested in \(f_{V|T}(v|t)\), which we can obtain using Bayes’ theorem.
\[
f_{V|T}(v|t)
= \frac{ f_{T|V}(t|v) f_V(v) }{ f_{T|V}(t|0) f_V(0) + f_{T|V}(t|1) f_V(1)}.
\]
We’ll now define the function fVgivenT that computes the numerator
and denominator in this fraction,
and returns the ratio.
def fVgivenT(v,t):
num = fTgivenV(t,v) * fV(v)
denom = fTgivenV(t,0) * fV(0) + fTgivenV(t,1) * fV(1)
return num / denom
The probability of the patient having the virus \(f_{V|T}(1|t=1)\), given they tested positive is:
fVgivenT(1,1)
0.12217194570135746
Intuitively, this percentage is measuring the ratio of the probability of the outcome \(f_{VT}(1,1)\) divided by sum of the probabilities of all outcomes where positive test can occur: \(f_{VT}(1,1)+f_{VT}(0,1)\).
fTV(1,1) / (fTV(1,0) + fTV(1,1))
0.12217194570135746
Define datasets for the examples#
Example 1: Multivariable uniform#
xmin = 100
xmax = 200
ymin = 10
ymax = 20
# joint pdf of = randint(100, 200) x randint(10,20)
def fU(x,y):
A = (xmax-xmin) * (ymax-ymin)
if xmin <= x and x <= xmax and ymin <= y and y <= ymax:
return 1/A
else:
return 0.0
fU(170, 12)
0.001
Discussion#
Dependence and independence#
Exercises#
Exercise: infer the coin output from the die roll#
\[
f_{X|Y}(\texttt{heads}|4) = \frac{ f_{Y|X}(4|\texttt{heads}) f_X(\texttt{heads}) }{ f_Y(4) }
\]
fD(4) * fC("heads") / fDs[3]
0.4