Guess the probability¶
i.e., Maximum Likelihood with the Binomial
We've got a weird-looking coin, and we'd like to figure out the probability that it comes up Heads when it's flipped. How do we do this?
(More generally: from independent draws of a 0/1-valued random variable $Y$ with $\P\{Y = 1\} = p$, infer $p$.)
Let's say out of 20 flips it came up Heads 15 times. What's your best guess at $p$?
Well, since the number of heads, $X$, is $\text{Binomial}(20, p)$, $$ \P\{ X = 15 \} = \binom{20}{15} p^{15} (1-p)^5. $$ Let's call this $L(p)$.
Maximizing $L(p)$ is the same as maximizing $\log L(p)$, and $$ \frac{d}{dp} \log L(p) = \frac{15}{p} - \frac{5}{1-p} , $$ which is equal to 0 at $p = 3/4$. (also check: $d^2/dp^2 \log L(p) \le 0$)
Okay, so our guess is $p = 0.75$ "by maximum likelihood". How sure are we?
import math
import numpy as np
import matplotlib.pyplot as plt

def L(p):
    # likelihood of 15 heads in 20 flips, as a function of p
    C = math.factorial(20) / (math.factorial(15) * math.factorial(5))
    return C * (p ** 15) * ((1-p) ** 5)

pvals = np.linspace(0, 1, 51)
plt.plot(pvals, L(pvals))
plt.xlabel("probability of heads (p)")
plt.ylabel("likelihood")
plt.axvline(0.75, c='red');
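As a quick numerical check (a sketch using scipy's general-purpose optimizer, not part of the derivation above), we can minimize the negative log-likelihood and confirm that the maximizer agrees with $p = 3/4$:

from scipy.optimize import minimize_scalar

# maximize log L(p) = 15 log(p) + 5 log(1 - p) (dropping the constant binomial
# coefficient, which doesn't change the maximizer) by minimizing its negative
res = minimize_scalar(lambda p: -(15 * np.log(p) + 5 * np.log(1 - p)),
                      bounds=(0.001, 0.999), method="bounded")
print(res.x)  # should be very close to 0.75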
Guess the mean¶
i.e., Maximum Likelihood with the Gaussian
I weighed 5 apples off my tree yesterday; their weights were 112g, 145g, 131g, 98g, and 104g. I've got about 200 apples on the tree; what's our estimate of their total weight?
x = [112, 145, 131, 98, 104]
np.mean(x)
118.0
Well, the average weight of those was 118g, so... probably around 200 * 118g = 23600g = 23.6kg?
More formally: let's assume the distribution of the weights (across all the apples) is Normal. If the mean is $\mu$ grams and standard deviation $\sigma$ grams, then the chance* of getting an apple that weighs $x$ grams is $$ L(x; \mu, \sigma) = \frac{1}{\sqrt{2 \pi \sigma^2}} \exp\left(-\frac{(x - \mu)^2}{2 \sigma^2}\right) . $$ (okay, this is actually a probability density, but let's go with it)
So, the chance of getting apples weighing $x_1, \ldots, x_n$ is $$ \prod_{i=1}^n L(x_i; \mu, \sigma) = \frac{1}{\sqrt{2 \pi \sigma^2}^n} \exp\left(- \frac{\sum_{i=1}^n (x_i - \mu)^2}{2 \sigma^2} \right) . $$
So, the log-likelihood is $$ \log L(x; \mu, \sigma) = - \frac{1}{2 \sigma^2}\sum_{i=1}^n (x_i - \mu)^2 - n \log(2 \pi \sigma^2)/2 , $$ and $$\begin{aligned} \frac{d}{d\mu} \log L(x; \mu, \sigma) &= \frac{1}{2\sigma^2}\sum_{i=1}^n 2 (x_i - \mu) \\ &= \frac{n}{\sigma^2}\left( \left(\frac{1}{n} \sum_{i=1}^n x_i\right) - \mu \right) \end{aligned}$$ Notice that $\bar x = \sum_{i=1}^n x_i / n$ is the sample mean.
So, $L(x;\mu,\sigma)$ is maximized if $\mu = \bar x$. (But first: check this is a maximum, not a minimum.) Remarkably, this doesn't depend on $\sigma$!
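(That check is quick: $$ \frac{d^2}{d\mu^2} \log L(x; \mu, \sigma) = -\frac{n}{\sigma^2} < 0 , $$ so the critical point at $\mu = \bar x$ really is a maximum.)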
We've learned that:
Given independent samples from a Normal($\mu$, $\sigma$) distribution, the maximum-likelihood estimate for $\mu$ is the sample mean.
Maximizing the likelihood under a Normal model is the same as minimizing the sum of squared errors (see the quick check after this list).
The maximum likelihood estimate for the mean apple weight on my tree is 118g.
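Here's the quick check promised above, as a sketch: minimize the sum of squared errors numerically and compare the minimizer to the sample mean.

from scipy.optimize import minimize_scalar

x = np.array([112, 145, 131, 98, 104])

# the value of mu that minimizes the sum of squared errors should be the sample mean
res = minimize_scalar(lambda mu: np.sum((x - mu) ** 2))
print(res.x, np.mean(x))  # both should be (about) 118.0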
Again: how certain are we? Now, the answer depends on $\sigma$.
muvals = np.linspace(100, 140, 51)
x = np.array([112, 145, 131, 98, 104])

def normal_L(mu, sigma):
    C = np.sqrt(2 * math.pi * sigma**2)
    return np.prod(np.exp(- (x - mu)**2 / (2 * sigma**2)) / C)

sigvals = [5, 25]
fig, axes = plt.subplots(1, len(sigvals))
for sig, ax in zip(sigvals, axes):
    ax.plot(muvals, [normal_L(m, sig) for m in muvals])
    ax.axvline(np.mean(x), c='r')
    ax.set_title(f"sigma={sig}")
Exercise: Make the plots using:
from scipy.stats import norm
mu, sigma = 150, 50
print(normal_L(mu, sigma), np.prod(norm.pdf(x, loc=mu, scale=sigma)))
8.552412635759266e-12 8.552412635759269e-12
fig, (ax1, ax2) = plt.subplots(ncols=2)
xx = np.linspace(0, 300, 601)
pdf_x = norm.pdf(xx, loc=mu, scale=sigma)
cdf_x = norm.cdf(xx, loc=mu, scale=sigma)
ax1.plot(xx, pdf_x)
ax2.plot(xx, cdf_x);
Likelihood surfaces¶
So, what did we just do? We
- formulated a generative model that seems likely to fit the data, with some free parameters describing the things we want to know about,
- wrote down the likelihood (i.e., probability) of generating our actual data as a function of the parameter(s), and
- said "gee, it seems like a good guess for the parameter values is whatever makes our data look most probable".
Formally: given data $D$ and a model $M$ with parameters $\theta \in A$ and likelihood function $$ L_M(D|\theta) = \P_M\{D|\theta\} , $$ a maximum likelihood estimate of $\theta$ is $$ \theta^* = \text{argmax}_{\theta \in A}\{ L_M(D|\theta) \}, $$ i.e., $\theta^*$ is the value of the parameters that maximizes the likelihood.
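For a concrete, brute-force illustration of that argmax, here's the coin-flip example from above with the likelihood evaluated on a grid of candidate values of $p$ (just a sketch; in practice we'd use calculus or a numerical optimizer, as below):

from scipy.stats import binom

# likelihood of the data (15 heads out of 20 flips) at each candidate value of p
pgrid = np.linspace(0.01, 0.99, 99)
Lvals = binom.pmf(15, 20, pgrid)

# the MLE is the grid value that maximizes the likelihood
print(pgrid[np.argmax(Lvals)])  # should be 0.75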
Example: the gamma distribution¶
Let's say we have hourly rainfall measurements throughout a winter storm. We'd like to (a) estimate the average rainfall for the storm (across a wider region, taking our location as representative), and (b) fit a distribution to describe the hourly variation.
Let $X_i$ be the number of millimeters of rain that fell in the $i^\text{th}$ hour. Since $X_i \ge 0$, let's take $$ X_i \sim \text{Gamma}(\text{scale}=\theta, \text{shape}=k) . $$ Our goal will be to find a $\theta$ and $k$ that describe the data well. And, here's our data:
rain
array([ 17., 99., 79., 24., 32., 45., 93., 83., 113., 32., 3., 114., 71., 98., 56., 31., 95., 31., 76., 83., 70., 107., 23., 40., 168., 43., 43., 28., 20., 49., 26., 32., 10., 58., 52., 35., 81., 124., 28., 46., 25., 120., 24., 17., 57., 96., 59., 100., 33., 74., 27., 33., 33., 35., 76., 104., 31., 24., 33., 24., 73., 34., 50., 53., 50., 33., 106., 125., 44., 94., 35., 54.])
Maximum likelihood¶
We could look up the likelihood function for the Gamma distribution and try to solve for the maximum analytically. But we're only optimizing over two parameters, so it's easy to do by computer. (First, a quick reminder of how the * and ** operators unpack a tuple or dict into a function's arguments; we'll use this below to pass the optimizer's parameter vector to our log-likelihood function.)
def g(a, b):
    return a + b

ab = (2, 3)
abd = {'a': 2, 'b': 3}
g(*ab), g(**abd)
(5, 5)
from scipy.stats import gamma
def logL(theta, k):
    lpdfs = gamma.logpdf(rain, a=k, scale=theta)  # "a" is the shape parameter here
    return np.sum(lpdfs)
from scipy.optimize import minimize
max_L = minimize(lambda x: -logL(*x), x0=(24, 1.8))
mle_theta, mle_k = max_L['x']
max_L
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 348.18845385033353
        x: [ 2.118e+01  2.711e+00]
      nit: 10
      jac: [ 0.000e+00  0.000e+00]
 hess_inv: [[ 1.240e+01 -1.389e+00]
            [-1.389e+00  1.877e-01]]
     nfev: 42
     njev: 14
thetavals = dsci.pretty([10, 40], 20)
kvals = np.round(dsci.pretty([0.5, 3], 10), 2)
Lmap = np.array(
    [[logL(theta, k) for theta in thetavals] for k in kvals]
)
fig, ax = plt.subplots()
im = ax.contourf(thetavals, kvals, Lmap)
ax.scatter(mle_theta, mle_k, marker="*", s=500)
cbar = ax.figure.colorbar(im, ax=ax)
ax.set_xlabel("scale (theta)"); ax.set_ylabel("shape (k)")
ax.set_title(f"maximum at theta = {mle_theta:.2f}, k = {mle_k:.2f}");
Comparison: since a Gamma distribution has mean $k\theta$ and variance $k\theta^2$, matching these to the sample mean and variance gives the method-of-moments estimates $\theta = \text{var}/\text{mean}$ and $k = \text{mean}/\theta$. For this data:
obs_mean = np.mean(rain)
obs_sd = np.std(rain)
theta = obs_sd**2 / obs_mean
k = obs_mean / theta
print(f"MOM estimates: theta = {theta:.2f}, k = {k:.2f}.")
print(f"MLE estimates: theta = {max_L['x'][0]:.2f}, k = {max_L['x'][1]:.2f}.")
MOM estimates: theta = 20.25, k = 2.84.
MLE estimates: theta = 21.18, k = 2.71.
What about our question? Well,
print(f"The observed mean rainfall/hour is {obs_mean:.3f} mm and the MLE-predicted mean is {(mle_theta * mle_k):.3f} mm.")
The observed mean rainfall/hour is 57.417 mm and the MLE-predicted mean is 57.417 mm.
(The exact agreement is no coincidence: setting the $\theta$-derivative of the Gamma log-likelihood to zero forces $k \theta = \bar x$, so the MLE of the mean always equals the sample mean.)
Goodness-of-fit, and uncertainty¶
In summary, we've got (hopefully) good estimates of the parameters in a model that we think ought to fit the rainfall data.
Let's continue, asking
- Is it a good model for the data? and
- how certain are we about the parameter estimates?
To do this, we'll simulate data under the model, with the fitted parameters, and ask:
- Does the simulated data look like the real data?
- If we re-do inference, how far off are we from the true values?
rain
array([ 17., 99., 79., 24., 32., 45., 93., 83., 113., 32., 3., 114., 71., 98., 56., 31., 95., 31., 76., 83., 70., 107., 23., 40., 168., 43., 43., 28., 20., 49., 26., 32., 10., 58., 52., 35., 81., 124., 28., 46., 25., 120., 24., 17., 57., 96., 59., 100., 33., 74., 27., 33., 33., 35., 76., 104., 31., 24., 33., 24., 73., 34., 50., 53., 50., 33., 106., 125., 44., 94., 35., 54.])
def sim_rain():
    # simulate a dataset of the same size as the real one, under the fitted Gamma model
    # (assumes a numpy Generator rng, e.g. from np.random.default_rng(), was set up earlier)
    sim_x = rng.gamma(shape=mle_k, scale=mle_theta, size=len(rain))
    return sim_x
sim_rain()
array([ 32.15878681, 90.29979452, 120.64970166, 45.43758037, 34.19000996, 67.57743256, 39.5384326 , 64.00954305, 52.14834585, 43.9065727 , 46.80709265, 81.47900541, 68.74555455, 56.99253199, 30.63696993, 38.91539422, 95.92222192, 114.83519765, 152.17226352, 52.46765936, 29.60144015, 28.49692812, 40.3675987 , 117.91079593, 76.95123397, 26.15522173, 55.42350525, 22.43064679, 13.29220479, 28.27998446, 83.12239949, 130.14478902, 66.12441113, 97.01538698, 66.6420072 , 76.79481803, 54.89254181, 63.11907514, 10.50391286, 36.82967131, 120.10053616, 34.35975191, 21.06767437, 92.82459644, 2.84232213, 28.45267454, 57.14225809, 18.44652359, 6.60766187, 99.52041706, 94.72170703, 55.32461894, 53.48483275, 84.96605504, 79.65123689, 34.23844821, 11.63599354, 16.04474254, 36.60528762, 50.94043331, 71.47113594, 22.60486188, 36.01506343, 62.10031397, 52.62537103, 64.87274384, 9.66274055, 36.76248675, 80.39729592, 12.15988846, 31.37825464, 61.25565204])
num_reps = 4
fig, axes = plt.subplots(num_reps + 1, 1, figsize=(8, 15), sharex=True, sharey=True)
bins = dsci.pretty((0, 250), 30)
axes[0].hist(rain, bins=bins)
axes[0].set_title("real data")
for k in range(1, num_reps + 1):
    sim_x = sim_rain()
    axes[k].hist(sim_x, bins=bins)
    axes[k].set_title("simulated data")
axes[-1].set_xlabel("mm of rain/hr");
num_reps = 4
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
for k in range(num_reps):
    ax = axes.flatten()[k]
    sim_x = sim_rain()
    ax.scatter(np.sort(rain), np.sort(sim_x))
    ax.set_xlabel("real data")
    ax.set_ylabel("simulated data")
    ax.set_aspect(1)
def do_inference(rain):
    def logL(theta, k):
        lpdfs = gamma.logpdf(rain, a=k, scale=theta)  # "a" is the shape parameter here
        return np.sum(lpdfs)
    max_L = minimize(lambda x: -logL(*x), x0=(24, 1.8))
    return max_L['x']
{'estimated': do_inference(sim_rain()), 'true value used in simulation': (mle_theta, mle_k)}
{'estimated': array([21.70029262, 2.41889354]), 'true value used in simulation': (21.1799967312572, 2.71089115943792)}
Now we'll simulate a lot of datasets, all using the same choice of $\theta$ and $k$, and for each dataset use our MLE code above to re-estimate $\theta$ and $k$. Then, by looking at how widely the estimated values are spread around the true values, we can see how far off our estimates are likely to be.
num_reps = 200
estim_array = np.zeros((num_reps, 2))
for k in range(num_reps):
    estim_array[k, :] = do_inference(sim_rain())
fig, (ax0, ax1) = plt.subplots(1, 2)
ax0.hist(estim_array[:,0], bins=20)
ax0.axvline(mle_theta, label="true value", c='red')
ax0.set_xlabel("estimated values of theta")
ax0.legend()
ax1.hist(estim_array[:,1], bins=20)
ax1.axvline(mle_k, label="true value", c='red')
ax1.set_xlabel("estimated values of k")
ax1.legend();
In summary, our estimated value for theta (the scale parameter) is about 21, but with a wide range of uncertainty - it could reasonably be anything between about 12.5 and 30. For the shape parameter (k), the maximum likelihood estimate is about 2.7, with a reasonable range between about 2.0 and 4.5.
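One rough way to put numbers on those ranges (a sketch; the exact endpoints will vary from run to run) is to take the middle 95% of the simulated estimates:

# middle 95% of the estimates across the simulated datasets
theta_lo, theta_hi = np.quantile(estim_array[:, 0], [0.025, 0.975])
k_lo, k_hi = np.quantile(estim_array[:, 1], [0.025, 0.975])
print(f"theta: {theta_lo:.1f} to {theta_hi:.1f}; k: {k_lo:.2f} to {k_hi:.2f}")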
(What exactly we're doing here - getting a confidence interval - is something we'll talk about later.)