import pandas as pd
import sklearn.decomposition
import plotnine as p9
from plotnine.data import penguins
Penguin PCA
Quick exercise: penguins
Have a look at the Palmer Penguins dataset (which is provided already in plotnine).
Workflow:
- load in data, remove NAs
- do PCA: include proportion variance explained
- make plots of the first two PCs, colored by categorical variables (species, island, sex, …)
- look at the loadings to interpret the PCs (just do this by hand; no need to
pull in the fancy biplot function)
In class
Some code that will come in handy
def biplot(pcs, loadings, a=1, b=2, labels=None, color=None):
    """Build a PCA biplot: observation scores with loading arrows overlaid.

    Observations from ``pcs`` are drawn on the ``PC{a}`` / ``PC{b}`` axes,
    and each row of ``loadings`` is drawn as a red arrow from the origin
    with a red text label taken from its ``variable`` column.

    Parameters
    ----------
    pcs : pandas.DataFrame
        Scores to plot. Must contain the columns ``PC{a}`` and ``PC{b}``,
        plus whatever ``labels`` or ``color`` refers to.
    loadings : pandas.DataFrame
        Loadings to overlay. Must contain the columns ``PC{a}``, ``PC{b}``
        and ``variable``.
    a, b : int, default 1 and 2
        Indices of the components shown on the x and y axes.
    labels : str, optional
        Aesthetic mapping (presumably a column name in ``pcs``) used both as
        the text label and as the color of each observation. When given,
        observations are drawn as text instead of points.
    color : str, optional
        Aesthetic mapping (presumably a column name in ``pcs``) used to color
        the observation points.

    Returns
    -------
    plotnine.ggplot
        The assembled plot object.

    Raises
    ------
    ValueError
        If both ``labels`` and ``color`` are supplied.
    """
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would silently let `labels` win over `color`.
    if labels is not None and color is not None:
        raise ValueError(
            "biplot() accepts at most one of `labels` and `color`, not both"
        )

    # Scale factor that brings the loading arrows onto roughly the same scale
    # as the scores: the per-column ratio of standard deviations, averaged
    # over columns. pandas aligns the two Series on column name; columns
    # present on only one side yield NaN, which `.mean()` skips by default.
    score_sd = pcs.select_dtypes("float").std(axis=0)
    loading_sd = loadings.select_dtypes("float").std(axis=0)
    f = (score_sd / loading_sd).mean()

    p = p9.ggplot(pcs, p9.aes(x=f"PC{a}", y=f"PC{b}"))

    # Observation layer: text labels, colored points, or plain grey points.
    if labels is not None:
        p += p9.geom_text(mapping=p9.aes(label=labels, color=labels), alpha=0.5)
    elif color is not None:
        p += p9.geom_point(mapping=p9.aes(color=color), alpha=0.5)
    else:
        p += p9.geom_point(color='grey', alpha=0.5)

    # Loading layer. The scale factor is interpolated into the aes strings,
    # which plotnine evaluates as expressions against `loadings`.
    p = (
        p
        + p9.geom_segment(
            data=loadings,
            mapping=p9.aes(xend=f"PC{a} * {f}", yend=f"PC{b} * {f}"),
            x=0, y=0, arrow=p9.arrow(),
            color='red', alpha=0.75,
        )
        # Labels sit 20% beyond the arrow tips so they do not overlap them.
        + p9.geom_text(
            data=loadings,
            mapping=p9.aes(x=f"PC{a} * {f} * 1.2", y=f"PC{b} * {f} * 1.2", label="variable"),
            color='red'
        )
    )
    return p