import numpy as np
import matplotlib.pyplot as plt
# Seeded generator so the simulated data set is the same on every run.
rng = np.random.default_rng(123)

# Number of simulated individuals.
n = 100
# Ages drawn uniformly from [10, 20), in years.
age = rng.uniform(10, 20, size=n)

# Food availability in percent: gamma with shape 10 and scale 8 (mean 80),
# so individual values can exceed 100.
food = rng.gamma(10, 8, size=n)
# Mean length in m: 1 + 0.1 per year of age, reduced by 0.15 * 0.25 for every
# 10 percentage points by which food falls short of 100.
food_deficit = 100 - food
mean_length = 1 + 0.1 * age - 0.15 * 0.25 * food_deficit / 10
# Observed length in m: mean length plus Gaussian noise with sd 0.25.
length = rng.normal(mean_length, 0.25, size=n)

# Histogram of the simulated food availability.
plt.hist(x=food);

# Age against length: the noise-free mean lengths first, then the observed
# lengths colored by food availability.
plt.scatter(x=age, y=mean_length)
plt.scatter(x=age, y=length, c=food)
plt.xlabel("age (years)")
plt.ylabel("length (m)")
# Reference line: mean size at 100% food (length 2 m at age 10, +0.1 m/year).
plt.axline(xy1=(10, 2), slope=0.1)
# The colorbar describes the most recent scatter, i.e. the food-colored one.
plt.colorbar(label='food avail');

# 2x2 correlation matrix of age and observed length; the off-diagonal entry
# is their Pearson correlation coefficient.
np.corrcoef(x=age, y=length)

array([[1.        , 0.66506441],
       [0.66506441, 1.        ]])

# Least-squares line predicting age from length alone:
#   slope     a = r * sd(age) / sd(length)
#   intercept b = mean(age) - a * mean(length)
r_age_length = np.corrcoef(age, length)[0, 1]
sd_ratio = age.std(ddof=1) / length.std(ddof=1)
a = sd_ratio * r_age_length
b = age.mean() - a * length.mean()
# Predicted age for every observed length.
age_hat = a * length + b

a, b

(np.float64(4.704069567707924), np.float64(3.464646893346048))

# Age against length (axes swapped relative to the earlier scatter), with the
# ages predicted from length alone overlaid in red.
food_points = plt.scatter(length, age, c=food)
plt.scatter(length, age_hat, c='red')
plt.ylabel("age (years)"); plt.xlabel("length (m)")
plt.axline((2, 10), slope=10) # mean size at 100% food
# Hand the food-colored scatter to colorbar explicitly. Without a mappable,
# colorbar uses the most recently drawn one, which here is the red overlay:
# it carries no data values, so the bar would not reflect food availability.
plt.colorbar(food_points, label='food avail');

# Root-mean-square error of the length-only age predictions.
resid = age - age_hat
rmse = np.sqrt((resid**2).mean())
print(f"RMSE = {rmse} years")

RMSE = 2.1087153735840225 years

# Shape of the Gram matrix x^T x.
# NOTE: `x` is only assigned further down, where the design matrix is built;
# evaluated before that assignment this line raises NameError.
np.dot(x.T, x).shape

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 (x.T).dot(x).shape

NameError: name 'x' is not defined

# Design matrix with one row per individual: intercept column, length, food.
x = np.column_stack((np.ones(n), length, food))
# Least-squares coefficients from the normal equations (x^T x) a = x^T age.
# This rebinds `a`, which previously held the slope of the length-only fit,
# to the coefficient vector [intercept, length, food].
gram = np.dot(x.T, x)
moment = np.dot(x.T, age)
a = np.linalg.solve(gram, moment)

a

array([ 4.5508148 ,  5.12404589, -0.02594002])

# Ages predicted by the fit that uses both length and food.
age_hat2 = np.dot(x, a)

# Root-mean-square error of those predictions.
resid2 = age - age_hat2
rmse2 = np.sqrt((resid2**2).mean())
print(f"RMSE = {rmse2} years")

RMSE = 2.0013955204713203 years

Exercise: Estimating Elephants

The inference problem

Multivariate inference