Section 2.2 — Multiple random variables#

This notebook contains all the code examples from Section 2.2 Multiple random variables in the No Bullshit Guide to Statistics.

Notebook setup#

# Ensure required Python modules are installed
%pip install --quiet numpy scipy seaborn
[notice] A new release of pip is available: 26.0.1 -> 26.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
# load Python modules
import matplotlib.pyplot as plt  # generic plotting functions
import numpy as np               # numerical calculations
import pandas as pd              # data frames to store joint PMFs
import seaborn as sns            # plotting distributions
# Figures setup
plt.clf()  # needed otherwise `sns.set_theme` doesn't work
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc={"font.family": "serif",
        "font.serif": ["Palatino", "DejaVu Serif", "serif"],
        "figure.figsize": (5,3)},
)
%config InlineBackend.figure_format = 'retina'
<Figure size 640x480 with 0 Axes>
# Simple float __repr__  
import numpy as np
if int(np.__version__.split(".")[0]) >= 2:
    np.set_printoptions(legacy='1.25')

# set random seed for repeatability
np.random.seed(42)

Definitions#

jpdfXY = pd.DataFrame(data=[[ 0.1, 0.05, 0.02, 0.01, 0.01],
                            [0.06, 0.12, 0.06, 0.03, 0.03],
                            [0.02, 0.06, 0.12, 0.07, 0.02],
                            [0.01, 0.01, 0.05,  0.1, 0.05]],
                        index=["a", "b", "c", "d"],
                        columns=[1,2,3,4,5])
jpdfXY.columns.name = "X"
jpdfXY.index.name = "Y"

print(jpdfXY.sum().sum())
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfXY);
0.9999999999999999
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[5], line 11
      7 jpdfXY.columns.name = "X"
      8 jpdfXY.index.name = "Y"
      9 
     10 print(jpdfXY.sum().sum())
---> 11 from ministats.plots.probability import plot_joint_pdf_dots
     12 plot_joint_pdf_dots(jpdfXY);

ImportError: cannot import name 'plot_joint_pdf_dots' from 'ministats.plots.probability' (/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/ministats/plots/probability.py)
print(sum(jpdfXY.loc["b"]))
print(jpdfXY.loc["b"].values)
print(jpdfXY.loc["b"].values / sum(jpdfXY.loc["b"]))
0.3
[0.06 0.12 0.06 0.03 0.03]
[0.2 0.4 0.2 0.1 0.1]
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfXY.T, zmax=0.2)
ax.view_init(elev=30, azim=-50)
ax.text2D(1.19, 0.5, "$f_{XY}$", transform=ax.transAxes,
          fontsize=plt.rcParams["axes.labelsize"],
          rotation=90, va="center", ha="center");
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfXY, sigfigs=3)
\[\begin{split} \begin{array}{c|ccccc} f_{XY} & X=1 & X=2 & X=3 & X=4 & X=5 \\ \hline Y=a & 0.1 & 0.05 & 0.02 & 0.01 & 0.01 \\ Y=b & 0.06 & 0.12 & 0.06 & 0.03 & 0.03 \\ Y=c & 0.02 & 0.06 & 0.12 & 0.07 & 0.02 \\ Y=d & 0.01 & 0.01 & 0.05 & 0.1 & 0.05 \\ \end{array} \end{split}\]
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfXY, highlight=[[(1,"b"),(5,"b")]]);

Example 1: two coin tosses#

def fC(c):
    if c in ["heads", "tails"]:
        return 1/2
    else:
        return 0

def fC1C2(c1,c2):
    return fC(c1) * fC(c2)


for c1 in ["heads", "tails"]:
    for c2 in ["heads", "tails"]:
        print("Pr(" + c1 + "," + c2 + ") =", fC1C2(c1,c2))
Pr(heads,heads) = 0.25
Pr(heads,tails) = 0.25
Pr(tails,heads) = 0.25
Pr(tails,tails) = 0.25
import pandas as pd

#        C1=heads    C1=tails
pC1C2 = [[0.25,      0.25   ],   # C2=heads
         [0.25,      0.25   ]]   # C2=tails

jpdfC1C2 = pd.DataFrame(data=pC1C2,
                        columns=["heads", "tails"],
                        index=["heads", "tails"])
jpdfC1C2.columns.name = "C_1"
jpdfC1C2.index.name = "C_2"
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfC1C2);
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfC1C2, zmax=0.3)
ax.text2D(1.15, 0.56, "$f_{C_1C_2}$", transform=ax.transAxes,
          fontsize=plt.rcParams["axes.labelsize"],
          rotation=90, va="center", ha="center");

Example 2: rolling two dice#

def fD(d):
    if d in [1,2,3,4,5,6]:
        return 1/6
    else:
        return 0

def fD1D2(d1,d2):
    return fD(d1) * fD(d2)
#         D1=1  D1=2  D1=3  D1=4  D1=5  D1=6
pD1D2 = [[1/36, 1/36, 1/36, 1/36, 1/36, 1/36],  # D2=1
         [1/36, 1/36, 1/36, 1/36, 1/36, 1/36],  # D2=2
         [1/36, 1/36, 1/36, 1/36, 1/36, 1/36],  # D2=3
         [1/36, 1/36, 1/36, 1/36, 1/36, 1/36],  # D2=4
         [1/36, 1/36, 1/36, 1/36, 1/36, 1/36],  # D2=5
         [1/36, 1/36, 1/36, 1/36, 1/36, 1/36]]  # D2=6
jpdfD1D2 = pd.DataFrame(data=pD1D2,
                        columns=[1,2,3,4,5,6],
                        index=[1,2,3,4,5,6])

from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfD1D2, zmax=0.045);
ax.text2D(1.15, 0.56, "$f_{D_1D_2}$", transform=ax.transAxes,
          fontsize=plt.rcParams["axes.labelsize"],
          rotation=90, va="center", ha="center");
from ministats.plots.probability import plot_joint_pdf_dots
ax = plot_joint_pdf_dots(jpdfD1D2)
ax.set_xlabel("$d_1$")
ax.set_ylabel("$d_2$");
# CUT
# # PLOT D_1 + D_2 = 7
# xs, ys, fXYs = [], [], []
# for x in range(1,6+1):
#     y = 7 - x
#     xs.append(x)
#     ys.append(y)
#     fXYxy = fD1D2(x,y)
#     fXYs.append(fXYxy)
# fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
# ax.stem(xs, ys, fXYs, basefmt=" ")
# ax.set_xlabel("$d_1$")
# ax.set_ylabel("$d_2$")
# ax.set_zlabel("$f_{D_1D_2}$")
# ax.set_zlim([0, 0.05]);
from ministats.plots.probability import plot_joint_pdf_dots
ax = plot_joint_pdf_dots(jpdfD1D2, highlight=[(1,6),(2,5),(3,4),(4,3),(5,2),(6,1)]);
ax.set_xlabel("$d_1$")
ax.set_ylabel("$d_2$");

Example 3: coin-dependent dice roll#

import pandas as pd
#       D=1  D=3  D=3  D=4  D=5  D=6
pDC = [[1/6, 1/6, 1/6, 1/6, 1/6, 1/6],  # C=heads
       [1/4, 1/4, 1/4, 1/4,   0,   0]]  # C=tails
jpdfDC = pd.DataFrame(data=pDC,
                      index=["heads","tails"],
                      columns=[1,2,3,4,5,6])
jpdfDC.columns.name = "D"
jpdfDC.index.name = "C"
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfDC);
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfDC.T)
ax.text2D(1.15, 0.56, "$f_{DC}$", transform=ax.transAxes,
          fontsize=plt.rcParams["axes.labelsize"],
          rotation=90, va="center", ha="center");
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfDC, sigfigs=3)
\[\begin{split} \begin{array}{c|cccccc} f_{DC} & D=1 & D=2 & D=3 & D=4 & D=5 & D=6 \\ \hline C=\texttt{heads} & 0.167 & 0.167 & 0.167 & 0.167 & 0.167 & 0.167 \\ C=\texttt{tails} & 0.25 & 0.25 & 0.25 & 0.25 & 0 & 0 \\ \end{array} \end{split}\]
def fC(c):
    if c in ["heads", "tails"]:
        return 1/2
    else:
        return 0

def fD6(d):
    if d in [1,2,3,4,5,6]:
        return 1/6
    else:
        return 0

def fD4(d):
    if d in [1,2,3,4]:
        return 1/4
    else:
        return 0

def fDC(d,c):
    if c == "heads":
        return fC(c) * fD6(d)
    elif c == "tails":
        return fC(c) * fD4(d)

for c in ["heads", "tails"]:
    for d in [1,2,3,4,5,6]:
        print("Pr(" + str(d) + "," + c + ") =", round(fDC(d,c),4))
Pr(1,heads) = 0.0833
Pr(2,heads) = 0.0833
Pr(3,heads) = 0.0833
Pr(4,heads) = 0.0833
Pr(5,heads) = 0.0833
Pr(6,heads) = 0.0833
Pr(1,tails) = 0.125
Pr(2,tails) = 0.125
Pr(3,tails) = 0.125
Pr(4,tails) = 0.125
Pr(5,tails) = 0.0
Pr(6,tails) = 0.0
sum(fDC(d,c) for d in [1,2,3,4,5,6] for c in ["heads", "tails"])
1.0
# f_{Y|X}(y|heads)  [as a list]
[fDC(d,"heads") for d in range(1,6+1)]
[0.08333333333333333,
 0.08333333333333333,
 0.08333333333333333,
 0.08333333333333333,
 0.08333333333333333,
 0.08333333333333333]
# f_{Y|X}(y|tails)  [as a list]
[fDC(d,"tails") for d in range(1,6+1)]
[0.125, 0.125, 0.125, 0.125, 0.0, 0.0]
# f_Y(y)    [as a list]
fDs = [fDC(d,"heads")+fDC(d,"tails") for d in range(1,6+1)]
fDs
[0.20833333333333331,
 0.20833333333333331,
 0.20833333333333331,
 0.20833333333333331,
 0.08333333333333333,
 0.08333333333333333]
# ALT. compute f_{D|C}(d|heads)*0.5 + f_{D|C}(d|tails)*0.5
fDs = [fD6(d)*0.5+fD4(d)*0.5 for d in range(1,6+1)]
fDs
[0.20833333333333331,
 0.20833333333333331,
 0.20833333333333331,
 0.20833333333333331,
 0.08333333333333333,
 0.08333333333333333]
sum(fDs)
0.9999999999999999

Example 4: diagnostic test#

def fV(v):
    if v == 1:
        return 0.03
    elif v == 0:
        return 0.97

def fTgivenV1(t):
    if t == 1:
        return 0.90
    elif t == 0:
        return 0.10

def fTgivenV0(t):
    if t == 1:
        return 0.20
    elif t == 0:
        return 0.80

def fTgivenV(t,v):
    if v == 1:
        return fTgivenV1(t)
    elif v == 0:
        return fTgivenV0(t)

def fTV(t,v):
    return fTgivenV(t,v) * fV(v)

for v in range(0,1+1):
    for t in range(0,1+1):
        print("Pr({T=" + str(t) + ",V=" + str(v)+"})",
              "=", fTgivenV(t,v), "*", fV(v),
              "=", round(fTV(t,v),4))
Pr({T=0,V=0}) = 0.8 * 0.97 = 0.776
Pr({T=1,V=0}) = 0.2 * 0.97 = 0.194
Pr({T=0,V=1}) = 0.1 * 0.03 = 0.003
Pr({T=1,V=1}) = 0.9 * 0.03 = 0.027
import pandas as pd

#              T = 0           T = 1 
pTV = [[       0.8*0.97,  (1-0.8)*0.97],    #   V = 0
        [  (1-0.9)*0.03,      0.9*0.03]]    #   V = 1

jpdfTV = pd.DataFrame(data=pTV, columns=[0,1], index=[0,1])
jpdfTV.columns.name = "T"
jpdfTV.index.name = "V"
from ministats.plots.probability import plot_joint_pdf_dots
plot_joint_pdf_dots(jpdfTV);
from ministats.plots.probability import plot_joint_pdf_stems
ax = plot_joint_pdf_stems(jpdfTV.T)
ax.text2D(1.15, 0.56, "$f_{TV}$", transform=ax.transAxes,
          fontsize=plt.rcParams["axes.labelsize"],
          rotation=90, va="center", ha="center")
Text(1.15, 0.56, '$f_{TV}$')
../_images/8bfcfd7828a7a960ad4acbaddbce19172e4853ec5e1df7c7ba2d972e4ce513b9.png
from ministats.book.tables import joint_pdf_to_array
joint_pdf_to_array(jpdfTV, sigfigs=3)
\[\begin{split} \begin{array}{c|cc} f_{TV} & T=0 & T=1 \\ \hline V=0 & 0.776 & 0.194 \\ V=1 & 0.003 & 0.027 \\ \end{array} \end{split}\]
joint_pdf_to_array(jpdfTV, sigfigs=3, margins=True)
\[\begin{split} \begin{array}{c|cc|c} f_{TV} & T=0 & T=1 & f_{V} \\ \hline V=0 & 0.776 & 0.194 & 0.97 \\ V=1 & 0.003 & 0.027 & 0.03 \\ \hline f_{T} & 0.779 & 0.221 & \\ \end{array} \end{split}\]
joint_pdf_to_array(jpdfTV, sigfigs=3, margins=False, output="dataframe")
T 0 1
V
0 0.776 0.194
1 0.003 0.027
latex = joint_pdf_to_array(jpdfTV, sigfigs=3, margins=True, output="latex")
print(latex)
$$
\begin{array}{c|cc|c}
f_{TV} & T=0 & T=1 & f_{V} \\
\hline
V=0 & 0.776 & 0.194 & 0.97 \\
V=1 & 0.003 & 0.027 & 0.03 \\
\hline
f_{T} & 0.779 & 0.221 &  \\
\end{array}
$$
# f_T(0) = 
0.8 * 0.97 + 0.10 * 0.03
0.779
# f_T(1) = 
0.2 * 0.97 + 0.90 * 0.03
0.221
for t in range(0,1+1):
    fTt = fTV(t,0) + fTV(t,1)
    print("Pr({T="+str(t)+"}) =", fTt)
Pr({T=0}) = 0.779
Pr({T=1}) = 0.221

Example 4: diagnostic test (continued)#

We’re now interested in \(f_{V|T}(v|t)\), which we can obtain using Bayes’ theorem.

\[ f_{V|T}(v|t) = \frac{ f_{T|V}(t|v) f_V(v) }{ f_{T|V}(t|0) f_V(0) + f_{T|V}(t|1) f_V(1)}. \]

We’ll now define the function fVgivenT that computes the numerator and denominator in this fraction, and returns the ratio.

def fVgivenT(v,t):
    num = fTgivenV(t,v) * fV(v)
    denom = fTgivenV(t,0) * fV(0) + fTgivenV(t,1) * fV(1)
    return num / denom

The probability of the patient having the virus \(f_{V|T}(1|t=1)\), given they tested positive is:

fVgivenT(1,1)
0.12217194570135746

Intuitively, this percentage is measuring the ratio of the probability of the outcome \(f_{VT}(1,1)\) divided by sum of the probabilities of all outcomes where positive test can occur: \(f_{VT}(1,1)+f_{VT}(0,1)\).

fTV(1,1) / (fTV(1,0) + fTV(1,1))
0.12217194570135746

Define datasets for the examples#

Example 1: Multivariable uniform#

xmin = 100
xmax = 200
ymin = 10
ymax = 20

# joint pdf of = randint(100, 200) x randint(10,20)
def fU(x,y):
    A = (xmax-xmin) * (ymax-ymin)
    if xmin <= x and x <= xmax and ymin <= y and y <= ymax:
        return 1/A
    else:
        return 0.0
fU(170, 12)
0.001

Discussion#

Dependence and independence#

Exercises#

Exercise: infer the coin output from the die roll#

\[ f_{X|Y}(\texttt{heads}|4) = \frac{ f_{Y|X}(4|\texttt{heads}) f_X(\texttt{heads}) }{ f_Y(4) } \]
fD(4) * fC("heads") / fDs[3]
0.4