Hello again, Uncertainty¶

Peter Ralph

https://uodsci.github.io/dsci345

In [1]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15, 8)
import numpy as np
import pandas as pd
from dsci345 import pretty

rng = np.random.default_rng()

$$\renewcommand{\P}{\mathbb{P}} \newcommand{\E}{\mathbb{E}} \newcommand{\var}{\text{var}} \newcommand{\sd}{\text{sd}} \newcommand{\cov}{\text{cov}} \newcommand{\cor}{\text{cor}}$$ This is here so we can use \P and \E and \var and \cov and \cor and \sd in LaTeX below.

Prediction and Estimation¶

Suppose I've collected data on rent in Eugene. Consider the following questions:

  1. How much does rent cost in the Bethel neighborhood this year?
  2. How much is rent changing by, per year?

Question: What would good answers to these questions look like?

We could answer these questions by

  • reporting empirical averages from the actual data
  • fitting a black-box predictor and asking it questions
  • fitting a model with explicit terms for neighborhood and year

(Discuss pros and cons and how these would actually work.)

A third question¶

  3. How sure are you about those answers?
  1. How much does rent cost in the Bethel neighborhood this year?

Note that this has two notions of uncertainty: rents come in a range of values, but also there will be uncertainty in our estimates of that range.

Uncertainty, by math¶

Reminder: confidence intervals¶

If we have $n$ independent samples from some source, and the sample mean and SD are $\bar x$ and $s$ respectively, then a 95% confidence interval is $$ \bar x - t_n s / \sqrt{n} \qquad \text{to} \qquad \bar x + t_n s / \sqrt{n} , $$ where $t_n$ is the 97.5% quantile for the $t$ distribution with $n-1$ degrees of freedom.

Said another way: $\bar x$ is our estimate of the mean, and math gives us an estimate of the uncertainty behind our estimate.
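As a quick check of the formula, here's a small sketch that computes this interval for simulated data. (It uses scipy.stats for the $t$ quantile; scipy isn't loaded above, so that import is an assumption, and the sample itself is made up.)

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(123)
x = rng.normal(loc=600, scale=150, size=40)  # hypothetical sample of 40 rents

n = len(x)
xbar = x.mean()
s = x.std(ddof=1)                  # sample SD
t = stats.t.ppf(0.975, df=n - 1)   # 97.5% quantile of the t distribution, n-1 df
lo, hi = xbar - t * s / np.sqrt(n), xbar + t * s / np.sqrt(n)
```

This agrees with `stats.t.interval(0.95, n - 1, loc=xbar, scale=s / np.sqrt(n))`.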

  1. How much does rent cost in the Bethel neighborhood this year?

(Take the mean of the rents in Bethel this year.)

  2. How much is rent changing by, per year?

(Take the mean difference in rent of places that go up for rent in adjacent years.)

  3. How sure are you about those answers?

(Get 95% CIs for them.)

A model¶

For example: suppose we fit a model where the price of the $i^\text{th}$ apartment is $$\begin{aligned} p_i &= \beta_{n_i} + \beta_\text{year} y_i + \beta_\text{area} a_i + \epsilon_i \\ \epsilon_i &\sim \text{Normal}(0, \sigma) , \end{aligned}$$ where

  • $p_i$ is the price (in dollars)
  • $n_i$ is the neighborhood,
  • $\beta_n$ is the mean price in neighborhood $n$
  • $y_i$ is the year
  • $\beta_\text{year}$ is the amount that rent changes by, per year, on average
  • $a_i$ is the area in square feet
  • $\beta_\text{area}$ is the amount that rent goes up per additional square foot, on average
  • $\epsilon_i$ is the residual, and
  • $\sigma$ is how much otherwise similar apartments differ in price

  1. How much does rent cost in the Bethel neighborhood this year?

(Our estimate of $\beta_\text{Bethel}$ plus terms for year and area.)

  2. How much is rent changing by, per year?

(Our estimate of $\beta_\text{year}$.)

  3. How sure are you about those answers?

???

Aside: how do we fit this model?¶

We'll do it in two lines, with patsy and np.linalg.lstsq. (Reminder: since the residuals are Normal, least squares is the same as maximum likelihood.)
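Reminder of why these are the same: with Normal residuals, the log-likelihood of the model above is $$\log L(\beta, \sigma) = -\frac{n}{2} \log(2\pi\sigma^2) - \frac{1}{2\sigma^2} \sum_{i=1}^n \left( p_i - \beta_{n_i} - \beta_\text{year} y_i - \beta_\text{area} a_i \right)^2 ,$$ so for any fixed $\sigma$, maximizing the likelihood over the $\beta$s is the same as minimizing the sum of squared residuals.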

First, let's read in the data. (Note: it's fake data.) (github link)

In [2]:
df = pd.read_csv("data/housing.csv", index_col=0)
df
Out[2]:
hood year area rent
0 river 2014 489.0 942.0
1 bethel 2010 610.0 370.0
2 churchill 2017 385.0 669.0
3 bethel 2012 757.0 624.0
4 bethel 2019 504.0 990.0
... ... ... ... ...
395 bethel 2012 238.0 579.0
396 campus 2016 287.0 759.0
397 churchill 2016 526.0 679.0
398 campus 2014 418.0 868.0
399 bethel 2018 386.0 983.0

400 rows × 4 columns

Next, let's shift the variables, so that for instance $\beta_\text{Bethel}$ is the average price of a 500 ft${}^2$ apartment in 2010 (instead of a 0 ft${}^2$ apartment in the year 0):

In [3]:
df['area'] -= 500
df['year'] -= 2010

Now, fit the model, using patsy to make the design matrix (see board):

In [4]:
import patsy
outcome, predictors = patsy.dmatrices("rent ~ 0 + hood + year + area", df)
fit = np.linalg.lstsq(predictors, outcome, rcond=None)

# strip the "hood[...]" wrapper from the category column names
# (removeprefix/removesuffix, not lstrip/rstrip, which strip character *sets*)
estimates = { k.removeprefix('hood[').removesuffix("]") : v for k, v in zip(predictors.design_info.column_names, fit[0].ravel())}

estimates
Out[4]:
{'bethel': np.float64(556.0088681522523),
 'campus': np.float64(649.8851743401467),
 'churchill': np.float64(584.0968576228599),
 'river': np.float64(610.4626011683347),
 'whittaker': np.float64(643.2011227181563),
 'year': np.float64(31.21843122618344),
 'area': np.float64(0.10063620684218222)}
In [5]:
np.asarray(predictors)[:10,:]
Out[5]:
array([[   0.,    0.,    0.,    1.,    0.,    4.,  -11.],
       [   1.,    0.,    0.,    0.,    0.,    0.,  110.],
       [   0.,    0.,    1.,    0.,    0.,    7., -115.],
       [   1.,    0.,    0.,    0.,    0.,    2.,  257.],
       [   1.,    0.,    0.,    0.,    0.,    9.,    4.],
       [   0.,    0.,    0.,    0.,    1.,    5.,  -16.],
       [   0.,    0.,    0.,    1.,    0.,    6.,   93.],
       [   1.,    0.,    0.,    0.,    0.,    4., -196.],
       [   1.,    0.,    0.,    0.,    0.,    3.,  163.],
       [   0.,    1.,    0.,    0.,    0.,    2.,   81.]])
In [6]:
df
Out[6]:
hood year area rent
0 river 4 -11.0 942.0
1 bethel 0 110.0 370.0
2 churchill 7 -115.0 669.0
3 bethel 2 257.0 624.0
4 bethel 9 4.0 990.0
... ... ... ... ...
395 bethel 2 -262.0 579.0
396 campus 6 -213.0 759.0
397 churchill 6 26.0 679.0
398 campus 4 -82.0 868.0
399 bethel 8 -114.0 983.0

400 rows × 4 columns

Uncertainty, by simulation¶

Okay, so we'd like to know how far off our guesses of $\beta_\text{Bethel}$ or $\beta_\text{area}$ are from "the truth".

Well, one way to do this is to simulate data where we know the truth, and then see how far off our guesses are. If the simulations describe the real data well, then this should give us a good estimate.

This is sometimes known as the parametric bootstrap. (We'll meet its more famous cousin, "the" bootstrap, next.)

Okay: from our "real data" we estimated a set of parameter values. To make the simulated datasets look as much like the real data as possible, we should simulate using those estimated values.

In [7]:
def sim_rents(df, beta):
    # sigma is hardcoded at 150 here; TODO: estimate sigma from the residuals
    sim_df = df.copy()
    sim_df['rent'] = np.round(
        np.array([beta[h] for h in sim_df.hood]) + beta['year'] * sim_df.year
        + beta['area'] * sim_df.area
        + rng.normal(scale=150, size=len(df))
    )
    return sim_df
In [8]:
def fit_model(df):
    outcome, predictors = patsy.dmatrices("rent ~ 0 + hood + year + area", df)
    fit = np.linalg.lstsq(predictors, outcome, rcond=None)
    estimates = { k.removeprefix('hood[').removesuffix("]") : v for k, v in zip(predictors.design_info.column_names, fit[0].ravel())}
    return estimates
In [9]:
estimates
Out[9]:
{'bethel': np.float64(556.0088681522523),
 'campus': np.float64(649.8851743401467),
 'churchill': np.float64(584.0968576228599),
 'river': np.float64(610.4626011683347),
 'whittaker': np.float64(643.2011227181563),
 'year': np.float64(31.21843122618344),
 'area': np.float64(0.10063620684218222)}
In [10]:
experiments = pd.DataFrame([
    fit_model(sim_rents(df, beta=estimates))
    for _ in range(1000)
])
experiments
Out[10]:
bethel campus churchill river whittaker year area
0 533.492097 641.054411 577.588954 593.179317 638.185475 32.163165 0.032569
1 574.572277 667.759187 595.797701 647.783456 705.270210 27.760569 0.107175
2 553.549603 707.172589 567.599455 625.551145 645.908753 29.276295 0.045647
3 560.001821 640.772141 568.461616 601.715683 610.548337 33.429219 0.116936
4 551.271028 664.447607 582.445337 597.355046 671.718753 27.838210 0.178589
... ... ... ... ... ... ... ...
995 579.054218 662.593979 581.442602 600.067738 640.242324 29.303955 0.068018
996 551.156364 669.155511 611.647735 642.359660 665.796143 26.749064 0.095768
997 591.980817 610.074641 606.648483 612.052551 645.979702 29.191974 0.076391
998 570.263555 634.841921 563.774140 588.337918 640.850963 35.357078 0.050328
999 582.057310 642.819219 580.076694 617.251420 630.826407 32.823066 0.011303

1000 rows × 7 columns

In [11]:
plt.hist(experiments.bethel)
plt.xlabel("mean bethel rent"); plt.ylabel("frequency");
plt.axvline(estimates['bethel'], color='red');

Conclusion: The mean rent in Bethel for 500 ft${}^2$ apartments in 2010 was about \$556, with a 95% confidence interval of about \$518 to \$595.
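The interval here is just the middle 95% of the simulated estimates. A self-contained sketch of that computation (with stand-in normal draws in place of the experiments.bethel column from above):

```python
import numpy as np

rng = np.random.default_rng(0)
# stand-in for experiments.bethel: 1000 simulated estimates of beta_bethel
sims = rng.normal(loc=556, scale=20, size=1000)
lo, hi = np.quantile(sims, [0.025, 0.975])  # endpoints of the middle 95%
```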

For reference, here are the original estimates again; 95% intervals for the other parameters can be read off of experiments in the same way.

In [12]:
estimates
Out[12]:
{'bethel': np.float64(556.0088681522523),
 'campus': np.float64(649.8851743401467),
 'churchill': np.float64(584.0968576228599),
 'river': np.float64(610.4626011683347),
 'whittaker': np.float64(643.2011227181563),
 'year': np.float64(31.21843122618344),
 'area': np.float64(0.10063620684218222)}

Uncertainty, by our bootstraps¶

However, sometimes we don't have (or can't be bothered to make) a good generative model to simulate from. Or we tried and it was hard to get realistic-looking data. What to do then?

Well, what we'd like to do is to draw a bunch of new datasets from wherever we got the first one.

This seems unrealistic... but our best guess at what the source of our data looks like is the dataset itself!

The bootstrap¶

Suppose we have a method that estimates something. To get an estimate of uncertainty of that estimate,

  1. Obtain a new dataset, of the same size as the original, by resampling with replacement observations from the original dataset.

  2. Apply the method to this 'bootstrapped' dataset.

  3. Repeat lots of times.

Then, the middle 95% of the estimates gives you the "bootstrap 95% confidence interval".

Exercise:¶

Here are 10 numbers.

array([176, 255,  53,  94,  61, 119,  42, 109,   0,  30])
  1. Compute the median.
  2. 1,000 times, resample 10 numbers, with replacement, from this list of numbers, and compute the median.
  3. Make a histogram of those 1,000 medians, with the original median as a vertical line.
In [13]:
# IN CLASS
x = np.array([176, 255,  53,  94,  61, 119,  42, 109,   0,  30])
boots = np.array([np.median(rng.choice(x, size=10, replace=True)) for _ in range(1000)])
In [14]:
# 95% confidence interval
(np.quantile(boots, 0.025), np.quantile(boots, 0.975))
Out[14]:
(np.float64(36.0), np.float64(119.0))

Back to the rent data¶

Here's a (very) simple method to obtain a bootstrap resampled version of a data frame:

In [15]:
def bootstrap(df):
    # resample n rows, with replacement, from the n rows of df
    n = df.shape[0]
    return df.loc[rng.choice(n, n)]

And, here, let's apply the fit_model method above to 1000 of these.

In [16]:
boots = pd.DataFrame([
    fit_model(bootstrap(df))
    for _ in range(1000)
])
boots
Out[16]:
bethel campus churchill river whittaker year area
0 568.064604 647.906919 572.522506 628.785380 618.256371 33.622293 0.056935
1 566.397927 621.467612 586.455942 601.118356 656.145444 30.894811 0.119537
2 584.502240 613.010580 561.360131 609.489202 628.502952 31.620692 0.020535
3 560.740185 617.053438 593.109358 635.674673 632.473419 31.971218 0.189305
4 571.756057 677.722318 608.139147 614.600792 662.441502 26.206923 0.090265
... ... ... ... ... ... ... ...
995 541.233461 674.393450 588.214759 655.214241 653.555276 29.426819 0.046536
996 560.239981 613.215393 558.599596 597.414493 624.957982 38.334844 0.126020
997 541.226271 636.723186 562.281299 569.201520 633.184012 34.037509 0.146713
998 534.160013 643.611452 589.062351 578.953027 632.279690 32.189726 0.165540
999 597.568280 688.612910 628.627646 641.071724 669.827791 24.545432 0.101566

1000 rows × 7 columns

And, here is the distribution of estimates of $\beta_\text{Bethel}$. It is similar, but not identical, to our other distribution above (as expected).

In [17]:
plt.hist(boots.bethel)
plt.xlabel("mean bethel rent"); plt.ylabel("frequency");
plt.axvline(estimates['bethel'], color='red');

Conclusion: The mean rent in Bethel for 500 ft${}^2$ apartments in 2010 was about \$556, with a 95% confidence interval of about \$518 to \$595.

Widths of other 95% confidence intervals are below.

In [18]:
boots.quantile(0.975) - boots.quantile(0.025)
Out[18]:
bethel       81.873471
campus       94.305973
churchill    81.924064
river        80.584423
whittaker    79.389308
year         10.581510
area          0.201494
dtype: float64
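To get the interval endpoints themselves (rather than just the widths), take the two quantiles without differencing. A sketch with a stand-in for boots (made-up normal draws for two of the parameters):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
# stand-in for boots: bootstrap replicates of two of the parameters
fake_boots = pd.DataFrame({
    "bethel": rng.normal(556, 20, size=1000),
    "year": rng.normal(31, 2.7, size=1000),
})
ci = fake_boots.quantile([0.025, 0.975])  # one column per parameter, rows = endpoints
```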