Linear models: recap¶
We've seen a few flavors of "linear model". First, the 'standard': $$\begin{aligned} Y &= Xb + \epsilon \\ \epsilon &\sim \text{Normal}(\text{mean}=0, \text{sd}=\sigma) , \end{aligned}$$ which is equivalent to $$\begin{aligned} Y &\sim \text{Normal}(\text{mean}= Xb, \text{sd}=\sigma) . \end{aligned}$$
Robust: $$\begin{aligned} Y &\sim \text{Cauchy}(\text{location}= Xb, \text{scale}=\sigma) . \end{aligned}$$
And, in homework, Poisson: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=\exp(Xb)) . \end{aligned}$$
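To make the three flavors concrete, here is a minimal sketch of simulating a response from each, for a single predictor; the predictor values, coefficient, and noise scale below are made up for illustration (and `rng` is re-created here so the sketch stands alone, although the notes already define it):

import numpy as np
rng = np.random.default_rng()   # the notes' random generator, re-created for this sketch

n = 100
X = rng.uniform(0, 10, size=n)   # hypothetical predictor
b = 0.5                          # hypothetical coefficient
sigma = 1.0                      # hypothetical noise scale

y_normal = rng.normal(loc=X * b, scale=sigma)            # "standard" linear model
y_cauchy = X * b + sigma * rng.standard_cauchy(size=n)   # robust: Cauchy noise
y_poisson = rng.poisson(lam=np.exp(X * b))               # Poisson, mean = exp(Xb)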
What are these "models" for?
- understanding how signal and noise combine in your data, and how to extract the information you want
- simulating data, to test methods and build intuition
- providing a framework to describe the data (to fit the model, e.g., by maximum likelihood)
- describing how good predictions based on a fitted model are likely to be
You could, for instance, decide your data come from a Poisson model and yet still fit it by least squares (i.e., maximum likelihood with a Normal model). But: the "correct" model (i.e., one that better describes the noise distribution) should provide more accurate results.
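For instance (a sketch added here, not from the original notes): simulate counts from the Poisson model above and fit the same mean function $\exp(b_0 + b_1 x)$ both by least squares and by Poisson maximum likelihood, using scipy.optimize.minimize. Both estimates land near the true coefficients, but the Poisson fit uses the correct noise model.

# sketch: fit the same Poisson-mean model two ways (made-up coefficients)
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng()
x = rng.uniform(0, 2, size=200)
b_true = [0.5, 1.2]                                    # hypothetical true coefficients
y = rng.poisson(lam=np.exp(b_true[0] + b_true[1] * x))

def sum_sq(b):
    # least squares, i.e., maximum likelihood under a Normal noise model
    return np.sum((y - np.exp(b[0] + b[1] * x)) ** 2)

def negloglik(b):
    # negative Poisson log-likelihood (dropping terms that don't depend on b)
    mu = np.exp(b[0] + b[1] * x)
    return np.sum(mu - y * np.log(mu))

b_ls = minimize(sum_sq, x0=[0, 0], method="Nelder-Mead").x
b_ml = minimize(negloglik, x0=[0, 0], method="Nelder-Mead").x
print("least squares:", b_ls, "  Poisson ML:", b_ml)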
Generalizing¶
In each case, we said:
- The response, $Y$, is random with some distribution,
- whose mean is determined by a linear combination of the predictors, $X$.
Ingredients¶
A generalized linear model has three ingredients:
- a response distribution for $Y$ (the "family"),
- a linear predictor, $Xb$, and
- a link function $h( )$ connecting the linear predictor to the mean of the response, usually $h(\E[Y]) = Xb$.
For instance: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=\exp(Xb)) \end{aligned}$$ is a Poisson GLM with a log link function (since $\log(\E[Y]) = Xb$).
For instance: $$\begin{aligned} Y &\sim \text{Normal}(\text{mean}= Xb, \text{sd}=\sigma) \end{aligned}$$ is a Normal GLM with an identity link function (since $\E[Y] = Xb$).
Logistic models¶
Probably the most common GLM besides the Normal is the "Binomial GLM with a logistic link", i.e.: $$\begin{aligned} Y_i &\sim \text{Binomial}(\text{size}=N_i, \text{prob}=\theta_i) \\ \theta_i &= \frac{1}{1 + e^{-(Xb)_i}} . \end{aligned}$$
Here $f(x) = 1/(1 + e^{-x})$ is the logistic function, which takes the linear predictor (which can be any value) and gives us back a number between 0 and 1:
xx = np.linspace(-6, 6, 101)
plt.plot(xx, 1/(1 + np.exp(-xx)));
Example: keep your head down¶
Taller dandelion flowers spread their seeds farther... but are also more likely to be eaten. We survey a field before and after a herd of cows comes through, and determine that the chance that a dandelion $x$ inches high is eaten is $$ p(x) = \frac{1}{1 + e^{-(x - 5)}} . $$
xvals = np.linspace(0, 12, 61)
pvals = 1/(1 + np.exp(-(xvals - 5)))
plt.plot(xvals, pvals)
plt.xlabel("height (inches)"); plt.ylabel("probability of being eaten");
How might we determine this? Here's some hypothetical data: circles are uneaten, x's are eaten, plotted along with the theoretical curve and the proportion eaten in one-inch height bins:
n = 300
x = rng.uniform(low=0, high=12, size=n)               # heights
eaten = (rng.random(n) < 1/(1 + np.exp(-(x - 5))))    # eaten with probability p(x)
fig, ax = plt.subplots()
[ax.axvline(k, c='lightgrey') for k in range(13)];
# jitter the 0/1 outcomes vertically so the points don't overplot
ax.scatter(x[eaten], 1.0 + rng.uniform(-.05, .05, np.sum(eaten)), marker="x")
ax.scatter(x[~eaten], rng.uniform(-.05, .05, np.sum(~eaten)), marker="o")
ax.plot(xvals, pvals)
ax.set_xlabel("height"); ax.set_ylabel("eaten?")
# proportion eaten in each one-inch height bin
props = [np.mean(eaten[np.floor(x) == k]) for k in range(12)]
ax.scatter(np.arange(12) + 0.5, props, s=200);
from sklearn.linear_model import LogisticRegression
X = np.vstack([x]).T
lfit = LogisticRegression(penalty=None).fit(X, eaten)
lfit
LogisticRegression(penalty=None)
lfit.coef_, lfit.intercept_
(array([[0.91616516]]), array([-4.35251161]))
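The fitted slope and intercept should roughly recover the true curve $1/(1 + e^{-(x - 5)})$, i.e., a slope near 1 and an intercept near $-5$. One informal check (an addition here, not part of the original output) is the estimated height at which the probability of being eaten crosses 50%:

# the fitted probability is 0.5 where intercept + slope * x = 0
print(-lfit.intercept_[0] / lfit.coef_[0, 0])   # should be close to the true value of 5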
x[:10], lfit.predict_proba(X)[:10]
(array([ 5.70975965, 11.15747462, 5.89007913, 5.20256335, 9.84027641, 1.13402772, 3.61101554, 5.10076517, 10.02629683, 8.18884787]), array([[0.29347394, 0.70652606], [0.00281611, 0.99718389], [0.26042252, 0.73957748], [0.39797838, 0.60202162], [0.00935165, 0.99064835], [0.96489091, 0.03510909], [0.7396644 , 0.2603356 ], [0.4205218 , 0.5794782 ], [0.00789788, 0.99210212], [0.04109815, 0.95890185]]))
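The two columns of predict_proba are in the order given by lfit.classes_; here that's [False, True], so the second column is the probability of being eaten:

lfit.classes_   # array([False,  True]); predict_proba's columns follow this order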
fig, ax = plt.subplots()
xx = np.linspace(0, 12, 121)
ax.plot(xx, lfit.predict_proba(np.column_stack([xx])), label=['predicted not eaten', 'predicted eaten'])
ax.plot(xx, 1/(1 + np.exp(-(xx - 5))), label='true relationship')
ax.set_xlabel("height (in)"); ax.set_ylabel("probability of being eaten")
ax.legend();
Your turn¶
Identify the family, link function, and linear predictor of the following GLM: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=(b_0 + b_1 X)^2) . \end{aligned}$$ Then, simulate 150 data points from it, and plot, taking $Y$ to be the number of misspelled words per page in an essay and $X$ to be the number of hours past 8pm.
Example: incidence¶
Let's build a model of lung cancer incidence, based loosely on the results of Tammemagi et al 2011. Suppose we have a study of smokers aged 50-80 years old, for whom we find out (a) their age, (b) how many "pack-years" they smoked over their life (this ranges from 0 to 250, but is mostly below 50), and (c) whether they develop lung cancer over the 10-year study. The overall cancer rate is about 7% (tangential note: it is about 30x lower in non-smokers). Suppose that the probability that someone of age $a$ who has smoked a total of $s$ pack-years develops lung cancer is $$\begin{aligned} p(a, s) = \frac{1}{1 + \exp(-(-4.1 + .02 a + .01 s))} . \end{aligned}$$ Incidence goes up with both age and pack-years.
Here is a visual depiction of the model:
aa = np.linspace(50, 80, 101)
for s in [0, 10, 30, 50, 100]:
    plt.plot(aa, 1/(1 + np.exp(-(-4.1 + .02 * aa + .01 * s))), label=f"pack-years: {s}")
plt.xlabel("age"); plt.ylabel("cancer incidence")
plt.legend();
Our plan:
- Simulate 10,000 data points (people) from this model.
- Fit the model.
- See how close we get to the true parameters.
First, let's pick distributions for age and pack-years:
n = 10_000
age = np.round(rng.uniform(low=50, high=80, size=n), 1)
pack_years = np.round(rng.gamma(shape=6, scale=0.1*age), 0)
fig, (ax0, ax1) = plt.subplots(1, 2)
ax0.hist(age, bins=pretty(age, 40))
ax0.set_xlabel("age (years)"); ax0.set_ylabel("frequency")
ax1.hist(pack_years, bins=pretty(pack_years, 40))
ax1.set_xlabel("pack-years smoked"); ax1.set_ylabel("frequency");
The joint distribution:
plt.scatter(age, pack_years, s=1)
plt.xlabel("age (years)"); plt.ylabel("pack-years smoked");
Now, let's decide whether they develop cancer based on age and years smoked:
theta = -4.1 + .02 * age + .01 * pack_years    # linear predictor
prob_cancer = 1/(1 + np.exp(-theta))           # logistic function gives each person's probability
cancer = rng.binomial(n=1, p=prob_cancer)      # 0/1 outcome for each person
# plot outcome (vertically jittered) against the linear predictor
plt.scatter(theta, cancer + rng.uniform(low=-.05, high=.05, size=n), s=1);
To fit the model we could use our old friend scipy.optimize.minimize, but instead we'll use scikit-learn:
from sklearn.linear_model import LogisticRegression
X = np.vstack([age, pack_years]).T
lfit = LogisticRegression(penalty=None).fit(X, cancer)
b_age, b_py = lfit.coef_[0]
print(f"The linear predictor is {lfit.intercept_[0]:.3} + {b_age:.3} * age + {b_py:.3} * pack_years .")
Here's the fit (it's pretty good!):
aa = np.linspace(50, 80, 101)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
for s, c in zip([0, 10, 30, 50, 100], colors):
    plt.plot(aa, 1/(1 + np.exp(-(-4.1 + .02 * aa + .01 * s))), label=f"pack-years: {s}", c=c)
    plt.plot(aa, lfit.predict_proba(np.vstack([aa, np.repeat(s, len(aa))]).T)[:,1], linestyle=":", c=c)
plt.xlabel("age"); plt.ylabel("cancer incidence")
plt.legend();
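As a quick numeric check of the last step of our plan (this comparison is an addition, not part of the original output), put the fitted coefficients next to the true values used in the simulation:

# true values used in the simulation: intercept -4.1, age .02, pack-years .01
print("true coefficients:  ", [-4.1, 0.02, 0.01])
print("fitted coefficients:", [round(lfit.intercept_[0], 3), round(b_age, 3), round(b_py, 3)])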