When there's too many right answers

Peter Ralph

https://uodsci.github.io/dsci345

In [1]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15, 8)
import numpy as np
import pandas as pd
import patsy
from dsci345 import pretty

rng = np.random.default_rng(seed=123)

$$\renewcommand{\P}{\mathbb{P}} \newcommand{\E}{\mathbb{E}} \newcommand{\var}{\text{var}} \newcommand{\sd}{\text{sd}} \newcommand{\cov}{\text{cov}} \newcommand{\cor}{\text{cor}}$$ This is here so we can use \P and \E and \var and \cov and \cor and \sd in LaTeX below.

In [2]:
x = rng.uniform(high=10, size=101)
x.sort()
y = 10 * (1.2 * (x < np.pi) + 2 * np.logical_and(x > np.pi, x < 8) + (10 - x) * (x > 8)) - 12
y += rng.normal(scale=2, size=len(x))
df = pd.DataFrame({'t': x, 'y': y})

A motivating problem: interpolation

Suppose we've got a time series of noisy observations like the following, and we'd like to infer the underlying signal:

In [3]:
plt.scatter(df['t'], df['y'])
plt.xlabel("time (t)"); plt.ylabel("response (y)");
[figure: scatter plot of the noisy observations, response (y) against time (t)]

Does the right answer look like this?

In [4]:
plt.scatter(df['t'], df['y'])
plt.plot(df['t'], df['y'])
plt.xlabel("time (t)"); plt.ylabel("response (y)");
[figure: the same scatter with consecutive points joined by line segments]

Or like this?

In [5]:
ey = 10 * (1.2 * (x < np.pi) + 2 * np.logical_and(x > np.pi, x < 8) + (10 - x) * (x > 8)) - 12
plt.plot(df['t'], ey)
plt.scatter(df['t'], df['y'])
plt.xlabel("time (t)"); plt.ylabel("response (y)");
[figure: the scatter with the true underlying piecewise signal overlaid]

Or what?

In [6]:
plt.plot(df['t'], np.convolve(ey, np.ones(11)/11)[5:-5])
plt.scatter(df['t'], df['y'])
plt.xlabel("time (t)"); plt.ylabel("response (y)");
[figure: the scatter with a smoothed (moving-average) version of the signal overlaid]

A quick introduction to splines

What we'd like to do is to fit a model like $$ y_i = \beta_1 f_1(t_i) + \cdots + \beta_k f_k(t_i) + \epsilon_i , $$ where $f_1(t), \ldots, f_k(t)$ are "nice smooth functions".
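In matrix form, if we let $F$ be the matrix with entries $F_{ij} = f_j(t_i)$, this is just the linear model $y = F \beta + \epsilon$: once we've computed $F$, everything we already know about least squares carries over.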

A good way to get a bunch of "nice smooth functions" is from a "spline basis", like this one (thanks, patsy!):

In [7]:
s = np.hstack([
    (10/k) * patsy.dmatrix(f"bs(t, df={k}, degree=3, include_intercept=True) - 1", df)
    for k in (4, 8, 12, 16, 24, 32, 48, 64)
])
 
plt.plot(df['t'], s);
[figure: the spline basis functions plotted against t]

By taking linear combinations of these functions we can make a pretty wide range of curves. Here are a few randomly chosen ones:

In [8]:
coefs = rng.normal(size=(s.shape[1], 4))   # one coefficient per basis function, for four random curves
coefs.shape, s.shape, df.shape             # sanity check of the dimensions
In [9]:
plt.plot(df['t'], s.dot(coefs));
[figure: four random linear combinations of the spline basis functions]

Well, we know how to fit that model! Least squares, here we come!

In [10]:
sfit, _, _, _ = np.linalg.lstsq(s, y, rcond=None)

plt.scatter(df['t'], df['y']);
plt.plot(df['t'], s.dot(sfit));
[figure: the unpenalized least-squares spline fit overlaid on the data]

Hm - that looks a bit too... jagged? Overfit, a bit?

Too many knobs

One problem is that there are too many degrees of freedom: we're trying to infer 208 parameters (4 + 8 + 12 + 16 + 24 + 32 + 48 + 64 basis functions) from only 101 data points.

But, all those knobs make our inferred curves nice and flexible! Which ones do we need?

One solution would be to reduce the number of knobs (i.e., use fewer basis functions), as sketched below.
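For instance, here's a rough sketch of that first option (not the route we'll take below), assuming the same `df` and `y` as above: build a single small spline basis, with an arbitrarily chosen df=8, and fit it by ordinary least squares.

s_small = patsy.dmatrix("bs(t, df=8, degree=3, include_intercept=True) - 1", df)
small_fit, _, _, _ = np.linalg.lstsq(s_small, y, rcond=None)
plt.scatter(df['t'], df['y'])
plt.plot(df['t'], np.asarray(s_small).dot(small_fit));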

Another solution is to adjust our expectations: never mind, we don't actually want the best possible solution, we just want a pretty good one, please? And, we'd like it to be reasonable?

Regularization, again

Recall we're fitting this model: $$ y_i = \beta_1 f_1(t_i) + \cdots + \beta_k f_k(t_i) + \epsilon_i , $$ which suggests finding $\beta$ to minimize the loss function $$ \sum_i \left( y_i - \sum_{j=1}^k \beta_j f_j(t_i) \right)^2 . $$

To "encourage smoothness" we might add to this a penalty that depends on the wiggliness of the functions, say $$ + \alpha \sum_j \beta_j^2 \int f_j''(t)^2 dt . $$

We don't have those second derivatives easily available here, so I've cheated a little in how I set up the functions (that's roughly what the $10/k$ scaling in the basis construction is for: wigglier basis functions are scaled down, so they need bigger coefficients, which a penalty on $\beta$ then discourages), and we can just add a "ridge" penalty that does roughly the same thing: $$ + \alpha \sum_j \beta_j^2 . $$
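For concreteness, here is a minimal sketch of solving this penalized least squares problem directly, assuming the same `s` and `y` as above. It ignores the intercept (which sklearn's Ridge, used below, handles by centering), so it won't match those fits exactly; `ridge_by_hand` and `beta_hand` are just illustrative names.

def ridge_by_hand(S, y, alpha):
    # minimize sum_i (y_i - (S @ b)_i)^2 + alpha * sum_j b_j^2 ;
    # setting the gradient to zero gives (S^T S + alpha I) b = S^T y
    k = S.shape[1]
    return np.linalg.solve(S.T @ S + alpha * np.eye(k), S.T @ y)

beta_hand = ridge_by_hand(s, y, alpha=2.0)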

Here are the results of using ridge regularization at several different strengths. Which looks best?

In [11]:
from sklearn.linear_model import Ridge

plt.scatter(df['t'], df['y'])
plt.plot(df['t'], s.dot(sfit), label='unpenalized')
for a in [0.1, 2, 50, 1000]:
    rfit = Ridge(alpha=a).fit(s, y)
    rpred = s.dot(rfit.coef_) + rfit.intercept_
    plt.plot(df['t'], rpred, label=f'ridge (alpha={a})');
plt.legend();
[figure: ridge fits at several values of alpha, together with the unpenalized fit, overlaid on the data]

How do we find a good strength of regularization? Cross-validation!

In [12]:
def do_xval(alpha, test):
    # fit on the training folds, then compute RMSE on the held-out test fold
    rfit = Ridge(alpha=alpha).fit(s[~test,:], y[~test])
    rpred = s.dot(rfit.coef_) + rfit.intercept_
    return np.sqrt(np.mean((y[test] - rpred[test])**2))

def xval(alpha, folds):
    return np.mean([do_xval(alpha, folds==j) for j in np.unique(folds)])

folds = np.repeat(np.arange(10), 11)[:101]
rng.shuffle(folds)

avals = np.linspace(0.2, 5, 31)
rmse = np.array([xval(a, folds) for a in avals])
a_min = avals[np.argmin(rmse)]

plt.plot(avals, rmse);
plt.scatter(a_min, rmse[np.argmin(rmse)])
plt.xlabel("alpha"); plt.ylabel("root mean squared testing error");
[figure: cross-validated root mean squared error as a function of alpha, with the minimum marked]
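(As an aside, scikit-learn can automate this search with RidgeCV; a quick sketch, assuming the same `s`, `y`, and `avals` as above. Since the folds are chosen differently, the selected `alpha_` should only be in the same ballpark as our `a_min`.)

from sklearn.linear_model import RidgeCV

# pick the best alpha from the candidates by 10-fold cross-validation
cvfit = RidgeCV(alphas=avals, cv=10).fit(s, y)
cvfit.alpha_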

The winner:

In [13]:
plt.scatter(df['t'], df['y'])
plt.plot(df['t'], s.dot(sfit), label='unpenalized')
rfit = Ridge(alpha=a_min).fit(s, y)
rpred = s.dot(rfit.coef_) + rfit.intercept_
plt.plot(df['t'], rpred, label=f'ridge (alpha={a_min:.4})');
plt.legend();
[figure: the ridge fit at the cross-validation-chosen alpha, compared to the unpenalized fit]

What happened there?

We wanted a flexible model, where we didn't have to pre-specify a specific, simple form for the answer.

But, "flexible" meant there were lots of good solutions, and the best solutions were too close (suffered from overfitting). (The inference problem is ill-posed.)

So, we had to be clever about how to choose a reasonable solution, out of the many good ones.

This same tension between flexibility and overfitting is common to many methods in statistics and machine learning.