Generalized Linear Models¶

Peter Ralph

https://uodsci.github.io/dsci345

In [1]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15, 8)
import numpy as np
import pandas as pd
from dsci345 import pretty

rng = np.random.default_rng()

$$\renewcommand{\P}{\mathbb{P}} \newcommand{\E}{\mathbb{E}} \newcommand{\var}{\text{var}} \newcommand{\sd}{\text{sd}} \newcommand{\cov}{\text{cov}} \newcommand{\cor}{\text{cor}}$$ This is here so we can use \P and \E and \var and \cov and \cor and \sd in LaTeX below.

Linear models: recap¶

We've seen a few flavors of "linear model". First, the 'standard': $$\begin{aligned} Y &= Xb + \epsilon \\ \epsilon &\sim \text{Normal}(\text{mean}=0, \text{sd}=\sigma) , \end{aligned}$$ which is equivalent to $$\begin{aligned} Y &\sim \text{Normal}(\text{mean}= Xb, \text{sd}=\sigma) . \end{aligned}$$

Robust: $$\begin{aligned} Y &\sim \text{Cauchy}(\text{location}= Xb, \text{scale}=\sigma) . \end{aligned}$$

And, in homework, Poisson: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=\exp(Xb)) . \end{aligned}$$
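To make the recap concrete, here's a minimal simulation sketch (not from the original notes; the design matrix and coefficients are made up) that draws one response vector from each of the three models, all sharing the same linear predictor $Xb$:

In [ ]:
n = 100
# made-up design matrix (intercept + one predictor) and coefficients
X = np.column_stack([np.ones(n), rng.uniform(0, 10, size=n)])
b = np.array([1.0, 0.5])
sigma = 2.0
Xb = X @ b

y_normal = rng.normal(loc=Xb, scale=sigma)            # 'standard' linear model
y_cauchy = Xb + sigma * rng.standard_cauchy(size=n)   # robust (Cauchy) model
y_poisson = rng.poisson(np.exp(Xb))                   # Poisson model with a log link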

What are these "models" for?

  • understanding how signal and noise combine in your data, and how to extract the information you want

  • simulating data, to test methods and build intuition

  • providing a framework to describe the data (to fit the model, e.g., by maximum likelihood)

  • describing how good predictions based on a fitted model are likely to be

You could, for instance, decide your data come from a Poisson model and yet still fit it by least squares (i.e., maximum likelihood with a Normal model). But: the "correct" model (i.e., one that better describes the noise distribution) should provide more accurate results.
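As a small illustration of that point, here's a sketch (again, not from the original notes; the coefficients are made up) that simulates data from the Poisson model above and then estimates the coefficients two ways: by nonlinear least squares on the mean curve $\exp(b_0 + b_1 x)$, and by fitting a Poisson GLM. Both target the same mean function, but the Poisson fit also uses the right noise model.

In [ ]:
from scipy.optimize import curve_fit
from sklearn.linear_model import PoissonRegressor

# simulate Y ~ Poisson(mean = exp(b0 + b1 x)) with made-up coefficients
n = 200
x = rng.uniform(0, 2, size=n)
b0, b1 = 0.5, 1.2
y = rng.poisson(np.exp(b0 + b1 * x))

# (1) least squares: minimize squared residuals from the mean curve
(ls_b0, ls_b1), _ = curve_fit(lambda x, b0, b1: np.exp(b0 + b1 * x), x, y, p0=[0.0, 1.0])

# (2) Poisson GLM with log link (alpha=0 turns off regularization)
pfit = PoissonRegressor(alpha=0).fit(x.reshape(-1, 1), y)

print("least squares:", ls_b0, ls_b1)
print("Poisson GLM:  ", pfit.intercept_, pfit.coef_[0])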

Generalizing¶

In each case, we said:

  1. The response, $Y$, is random with some distribution,
  2. whose mean is determined by a linear combination of the predictors, $X$.

Ingredients¶

A generalized linear model has three ingredients:

  1. a response distribution for $Y$ (the "family"),
  2. a linear predictor, $Xb$, and
  3. a link function $h( )$ connecting the linear predictor to the mean of the response, usually $h(\E[Y]) = Xb$.

For instance: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=\exp(Xb)) \end{aligned}$$ is a Poisson GLM with a log link function (since $\log(\E[Y]) = Xb$).

For instance: $$\begin{aligned} Y &\sim \text{Normal}(\text{mean}= Xb, \text{sd}=\sigma) . \end{aligned}$$ is a Normal GLM with an identity link function.

Logistic models¶

Probably the most common GLM besides the Normal is the "Binomial GLM with a logit link" (i.e., logistic regression): $$\begin{aligned} Y_i &\sim \text{Binomial}(\text{size}=N_i, \text{prob}=\theta_i) \\ \theta_i &= \frac{1}{1 + e^{-(Xb)_i}} . \end{aligned}$$

Here $f(x) = 1/(1 + e^{-x})$ is the logistic function (the inverse of the logit link), which takes the linear predictor (which can be any real number) and gives back a number between 0 and 1:

In [2]:
xx = np.linspace(-6, 6, 101)
plt.plot(xx, 1/(1 + np.exp(-xx)));
[figure: the logistic function]

Example: keep your head down¶

Taller dandelion flowers spread their seeds farther... but are also more likely to be eaten. We survey a field before and after a herd of cows comes through, and determine that the chance that a dandelion $x$ inches tall is eaten is $$ p(x) = \frac{1}{1 + e^{-(x - 5)}} . $$

In [3]:
xvals = np.linspace(0, 12, 61)
pvals = 1/(1 + np.exp(-(xvals - 5)))
plt.plot(xvals, pvals)
plt.xlabel("height (inches)"); plt.ylabel("probability of being eaten");
[figure: probability of being eaten vs. height]

How might we estimate this relationship from data? Here's some hypothetical data: circles are uneaten, x's are eaten, shown along with the theoretical curve and the proportion eaten in one-inch height bins (large dots):

In [4]:
n = 300
x = rng.uniform(low=0, high=12, size=n)
eaten = (rng.random(n) < 1/(1 + np.exp(-(x-5))))
fig, ax = plt.subplots()
[ax.axvline(k, c='lightgrey') for k in range(13)];
ax.scatter(x[eaten], 1.0 + rng.uniform(-.05, .05, np.sum(eaten)), marker = "x")
ax.scatter(x[~eaten], rng.uniform(-.05, .05, np.sum(~eaten)), marker = "o")
ax.plot(xvals, pvals)
ax.set_xlabel("height"); ax.set_ylabel("eaten?")
props = [np.mean(eaten[np.floor(x) == k]) for k in range(12)]
ax.scatter(np.arange(12) + 0.5, props, s=200);
[figure: simulated eaten/uneaten dandelions by height, binned proportions, and the true curve]
In [5]:
from sklearn.linear_model import LogisticRegression
X = np.vstack([x]).T
lfit = LogisticRegression(penalty=None).fit(X, eaten)
In [6]:
lfit
Out[6]:
LogisticRegression(penalty=None)
In [7]:
lfit.coef_, lfit.intercept_
Out[7]:
(array([[0.92826693]]), array([-4.18889562]))
In [8]:
x[:10], lfit.predict_proba(X)[:10]
Out[8]:
(array([10.73287297,  2.08649262, 11.65510899,  3.99240506,  8.28596169,
         5.82241048,  4.99844798,  4.26507183,  4.03432138,  6.58957577]),
 array([[0.00309745, 0.99690255],
        [0.90482924, 0.09517076],
        [0.00131822, 0.99868178],
        [0.61842725, 0.38157275],
        [0.02923583, 0.97076417],
        [0.22866664, 0.77133336],
        [0.38912343, 0.61087657],
        [0.55719123, 0.44280877],
        [0.60920424, 0.39079576],
        [0.12697327, 0.87302673]]))
In [9]:
fig, ax = plt.subplots()
xx = np.linspace(0, 12, 121)
ax.plot(xx, lfit.predict_proba(np.column_stack([xx])), label=['predicted not eaten', 'predicted eaten'])
ax.plot(xx, 1/(1 + np.exp(-(xx - 5))), label='true relationship')
ax.set_xlabel("height (in)"); ax.set_ylabel("probability of being eaten")
ax.legend();
[figure: fitted and true probability of being eaten vs. height]

Your turn¶

Identify the family, link function, and linear predictor of the following GLM: $$\begin{aligned} Y &\sim \text{Poisson}(\text{mean}=(b_0 + b_1 X)^2) . \end{aligned}$$ Then, simulate 150 data points from it, and plot, taking $Y$ to be the number of misspelled words per page in an essay and $X$ to be the number of hours past 8pm.

In [ ]:
 

Example: incidence¶

Let's build a model of lung cancer incidence, based loosely on the results of Tammemagi et al 2011. Suppose we have a study of smokers aged 50-80 years old, for whom we find out (a) their age, (b) how many "pack-years" they smoked during their life (this ranges from 0 to 250 but is mostly less than 50), and (c) whether they develop lung cancer over the 10-year study. The overall cancer rate is about 7% (tangential note: it is about 30x lower in non-smokers). Suppose that the probability that someone of age $a$ who has smoked a total of $s$ pack-years develops lung cancer is $$\begin{aligned} p(a, s) = \frac{1}{1 + \exp(-(-4.1 + .02 a + .01 s))} . \end{aligned}$$ Incidence goes up with both age and pack-years.
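For instance, under this model a 60-year-old with 30 pack-years has $p(60, 30) = 1/(1 + \exp(-(-4.1 + 0.02 \cdot 60 + 0.01 \cdot 30))) = 1/(1 + e^{2.6}) \approx 0.069$, i.e., about the 7% overall rate.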

Here is a visual depiction of the model:

In [10]:
aa = np.linspace(50, 80, 101)
for s in [0, 10, 30, 50, 100]:
    plt.plot(aa, 1/(1 + np.exp(-(-4.1 + .02 * aa + .01 * s))), label=f"pack-years: {s}")
plt.xlabel("age"); plt.ylabel("cancer incidence")
plt.legend();
[figure: cancer incidence vs. age, one curve per pack-years value]

Our plan:

  1. Simulate 10,000 data points (people) from this model.
  2. Fit the model.
  3. See how close we get to the true parameters.

First, let's pick distributions for age and pack-years:

In [11]:
n = 10_000
age = np.round(rng.uniform(low=50, high=80, size=n), 1)
pack_years = np.round(rng.gamma(shape=6, scale=0.1*age), 0)

fig, (ax0, ax1) = plt.subplots(1, 2)
ax0.hist(age, bins=pretty(age, 40))
ax0.set_xlabel("age (years)"); ax0.set_ylabel("frequency")
ax1.hist(pack_years, bins=pretty(pack_years, 40))
ax1.set_xlabel("pack-years smoked"); ax1.set_ylabel("frequency");
[figure: histograms of age and pack-years smoked]

The joint distribution:

In [12]:
plt.scatter(age, pack_years, s=1)
plt.xlabel("age (years)"); plt.ylabel("pack-years smoked");
[figure: pack-years smoked vs. age]

Now, let's decide whether they develop cancer based on age and years smoked:

In [13]:
theta = -4.1 + .02 * age + .01 * pack_years
prob_cancer =  1/(1 + np.exp(-theta))
cancer = rng.binomial(n=1, p=prob_cancer)

plt.scatter(theta, cancer + rng.uniform(low=-.05, high=.05, size=n), s=1);
[figure: cancer outcome (jittered) vs. linear predictor]

To fit the model we could use our old friend scipy.optimize.minimize, but instead we'll use scikit-learn:
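For the record, here's a minimal sketch of what the scipy.optimize.minimize route might look like (this cell is not in the original notes): write down the negative log-likelihood of the Binomial model with a logistic inverse link and minimize it over the intercept and the two coefficients.

In [ ]:
from scipy.optimize import minimize
from scipy.special import expit  # the logistic function, 1/(1 + exp(-x))

def negloglik(b):
    # b = (intercept, coefficient for age, coefficient for pack-years)
    p = expit(b[0] + b[1] * age + b[2] * pack_years)
    p = np.clip(p, 1e-12, 1 - 1e-12)  # avoid log(0)
    # Bernoulli log-likelihood: log(p) where cancer == 1, log(1 - p) where cancer == 0
    return -np.sum(cancer * np.log(p) + (1 - cancer) * np.log(1 - p))

mle = minimize(negloglik, x0=np.zeros(3))
print(mle.x)  # should be close to the true values (-4.1, 0.02, 0.01)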

In [14]:
from sklearn.linear_model import LogisticRegression
X = np.vstack([age, pack_years]).T
lfit = LogisticRegression(penalty=None).fit(X, cancer)
b_age, b_py = lfit.coef_[0]
print(f"The linear predictor is {lfit.intercept_[0]:.3} + {b_age:.3} * age + {b_py:.3} * pack_years .")

Here's the fit (it's pretty good!):

In [15]:
aa = np.linspace(50, 80, 101)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
for s, c in zip([0, 10, 30, 50, 100], colors):
    plt.plot(aa, 1/(1 + np.exp(-(-4.1 + .02 * aa + .01 * s))), label=f"pack-years: {s}", c=c)
    plt.plot(aa, lfit.predict_proba(np.vstack([aa, np.repeat(s, len(aa))]).T)[:,1], linestyle=":", c=c)
plt.xlabel("age"); plt.ylabel("cancer incidence")
plt.legend();
[figure: true (solid) and fitted (dotted) cancer incidence curves, by pack-years]