Principal Components Analysis (PCA)¶

Peter Ralph

https://uodsci.github.io/dsci345

In [1]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15, 8)
import numpy as np
import pandas as pd

rng = np.random.default_rng(1234)

$$\renewcommand{\P}{\mathbb{P}} \newcommand{\E}{\mathbb{E}} \newcommand{\var}{\text{var}} \newcommand{\sd}{\text{sd}} \newcommand{\cov}{\text{cov}}$$ This is here so we can use \P and \E and \var and \cov and \sd in LaTeX below.

When there's lots of variables¶

Suppose we have a bunch of data, like $$ \begin{bmatrix} X_{11} & X_{12} & \cdots & X_{1k} \\ X_{21} & X_{22} & \cdots & X_{2k} \\ \vdots & \vdots & \ddots & \vdots \\ X_{n1} & X_{n2} & \cdots & X_{nk} \end{bmatrix} $$ where $$\begin{aligned} X_{i \cdot} &= \text{(one observation)} \\ X_{\cdot j} &= \text{(one variable)} . \end{aligned}$$

How do we "look" at it? First, we look at all the variables separately and in pairs. ... what next?

One way to look at this is that we have $n$ points in $\mathbb{R}^k$.

Then, PCA answers the question: "What are the most important axes of variation (of the data in $\mathbb{R}^k$)?"

Let's start with an example.

Manatee Morphometrics¶

Manatees come in different shapes and sizes. In our morphometric study of manatees, we will take 20 different measurements from each of 250 manatees: head length, tail width, nostril spacing, belly diameter, et cetera. Here's a rough model of the data:

  • some manatees are bigger than others; this underlying "size" affects all variables roughly equally
  • there are average shape differences between sexes in some variables

So, we'll:

  1. choose the mean female minus male difference for each variable
  2. generate the sex and relative size of each manatee
  3. generate all measurements from independent Normals, adjusted by sex and size
In [2]:
k = 20    # number of measurements per manatee
n = 250   # number of manatees
# assign each manatee a sex, independently with probability 1/2
sexes = np.array(['f', 'm'])[1 * (rng.random(size=n) < 0.5)]
# relative "size" of each manatee
sizes = rng.normal(loc=1.0, scale=0.2, size=n)

mean_measurements = rng.gamma(shape=10, scale=10, size=k)
sex_diffs = np.sort(rng.normal(loc=10, scale=25, size=k)) # female minus male
sd_measurements = 20

# independent Normal measurements, shifted by sex and scaled by size
X = rng.normal(loc=mean_measurements, scale=sd_measurements, size=(n, k))
X[sexes == 'f', :] += sex_diffs
X *= sizes[:, np.newaxis]

What'd we just do? $$\begin{aligned} \mu_j &= \text{(mean measurement for variable $j$)} \\ d_j &= \text{(mean female minus male difference for variable $j$)} \\ S_i &= \text{("size" of manatee $i$)} \\ &\sim \text{Normal}(\text{mean}=1, \text{sd}=0.2) \\ F_i &= \begin{cases} 1 \qquad &\text{if manatee $i$ is female} \\ 0 \qquad &\text{otherwise} \end{cases} \\ \epsilon_{ij} &\sim \text{Normal}(0, \sigma), \quad \sigma = 20 \\ X_{ij} &= \text{(measurement $j$ for manatee $i$)} \\ &= S_i \left( \mu_j + d_j F_i + \epsilon_{ij} \right) . \end{aligned}$$

As a result, the two sexes are only slightly offset on any axis (seen in pairwise scatter plots):

In [3]:
# PCA fig 1: pairwise plots of the first three measurements
colors = np.array(['r', 'b'])[1 * (sexes == 'f')]  # females blue, males red
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for i in range(3):
    for j in range(3):
        ax = axes[i][j]
        if i == j:
            ax.hist(X[:, i])          # histogram on the diagonal
        else:
            ax.scatter(X[:, i], X[:, j], c=colors)
[Figure: histograms and pairwise scatter plots of the first three measurements, colored by sex]

First, we do PCA (explanation later):

In [4]:
covmat = np.cov(X.T)                  # covariance matrix of the k measurements
evals, evecs = np.linalg.eig(covmat)  # its eigenvalues and eigenvectors
eord = np.argsort(evals)[::-1]        # order by decreasing eigenvalue
evals = evals[eord]
evecs = evecs[:, eord]
pcs = X.dot(evecs)                    # project the data onto the eigenvectors

Scree plot¶

[Image: a steep scree slope (rocks below a cliff) in the mountains]

Larger eigenvalues are more important (in a way we'll explain later). It turns out that the squared eigenvalue is proportional to the "percent variance explained" by the corresponding PC. Plotting these is referred to as a "scree plot".

Scree plot:

In [5]:
# PCA fig 2
perc_var = 100 * evals**2 / np.sum(evals**2)
plt.plot(perc_var)
plt.xlabel("principal component")
plt.ylabel("percent variance explained")
plt.xticks(np.arange(0, 20, 2));
[Figure: scree plot of percent variance explained against principal component number]

Manatees in PC space¶

Here is a plot of the (projections of each manatee on to the) first two principal components, colored by sex of the manatee. The two sexes form nearly separated clouds, both spread out along PC1 but with females below males on PC2.

In [6]:
# PCA fig 3
plt.scatter(pcs[:,0], pcs[:,1], c=np.array(['red', 'blue'])[1*(sexes == 'f')])
plt.scatter(pcs[0,0], pcs[0,1], marker="*", s=1000) # manatee number 0
plt.xlabel(f"PC 1 ({perc_var[0]:.0f}%)"); plt.ylabel(f"PC 2 ({perc_var[1]:.0f}%)");
[Figure: manatees plotted on PC1 against PC2, colored by sex; manatee 0 marked with a star]

"PC1 is size"¶

What is PC1 telling us? The PC1 value and the average measurement (as a proxy for "size") are highly (negatively) correlated. (The sign is arbitrary: if $v$ is an eigenvector, so is $-v$, so each PC is only defined up to sign.)

In [7]:
# PCA fig 4
plt.scatter(pcs[:,0], np.mean(X, axis=1))
plt.xlabel("PC1"); plt.ylabel("average measurement");
[Figure: PC1 against average measurement, one point per manatee]

How variables contribute to each PC¶

Each PC is a linear combination of the 20 measurements; the coefficients are called the "loadings". Here they are for the first two PCs: PC1 is nearly constant (and negative), while PC2 decreases from positive to negative (which makes sense, because we sorted the sex effects of the variables).

In [8]:
# PCA fig 5
for j in np.arange(2):   # j, not k: k is already the number of variables
    plt.plot(evecs[:, j], label=f"PC{j+1}")

plt.xlabel("variable number")
plt.ylabel("coefficient in the PC")
plt.legend();
[Figure: loadings of PC1 and PC2 against variable number]

What's happening: the math¶

The Principal Components¶

Suppose that $X$ is $k$-dimensional with $\E[X] = 0$ and $$ \cov[X] = C .$$

Say we want to combine a bunch of these variables into a new, composite variable: take coefficients $a = (a_1, \ldots, a_{k})$ and define $Y = a \cdot X = \sum_{i=1}^k a_i X_i$.

What is $\var[Y]$?

Since $\E[X] = \E[Y] = 0$, $$\begin{aligned} \var[Y] &= \E[Y^2] \\ &= \E\left[ \left(\sum_{i=1}^k a_i X_i \right)^2 \right] \\ &= \E\left[ \sum_{i=1}^k \sum_{j=1}^k a_i a_j X_i X_j \right] \\ &= \sum_{i=1}^k \sum_{j=1}^k a_i a_j \E\left[ X_i X_j \right] \\ &= \sum_{i=1}^k \sum_{j=1}^k a_i a_j C_{ij} \\ &= a^T C a . \end{aligned}$$
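Here's a quick numerical check of this identity (ours, not part of the original notebook), using the simulated manatee data X and an arbitrary coefficient vector; the sample version holds exactly, with $C$ the sample covariance matrix:

a = rng.normal(size=X.shape[1])   # arbitrary coefficients
Y = X.dot(a)                      # the composite variable, one value per manatee
# sample variance of Y agrees with a^T C a (up to floating point)
print(np.var(Y, ddof=1), a @ np.cov(X.T) @ a)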

Which choice of coefficients $a$, with $\|a\| = 1$, maximizes the variance of $Y = a \cdot X$? (Some such constraint is needed: scaling $a$ up scales the variance up, too.)

The top eigenvector of the covariance matrix, $C$ (the one with the largest eigenvalue).

Which choice of coefficients $b$, with $\|b\| = 1$, maximizes the variance of $Z = b \cdot X$, subject to $\cov[Y, Z] = 0$?

The second eigenvector of the covariance matrix.
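As a sanity check (again ours, assuming covmat and evecs from the manatee example are still in scope), no random unit vector should beat the top eigenvector:

v1 = evecs[:, 0]
best = v1 @ covmat @ v1              # variance along the top eigenvector
for _ in range(1000):
    b = rng.normal(size=len(v1))
    b /= np.linalg.norm(b)           # a random unit vector
    assert b @ covmat @ b <= best + 1e-8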

Linear algebra fact: If $\lambda_i$ and $v_i$ are the $i^\text{th}$ eigenvalue and eigenvector of $C$, with $\|v_i\| = 1$, then $$ C = \sum_{i=1}^k \lambda_i v_i v_i^T . $$ Furthermore, since $C$ is symmetric, $v_i \cdot v_j = \delta_{ij}$, and $$ \sum_{ij} C_{ij}^2 = \sum_i \lambda_i^2 . $$

Said another way: If we change into the basis given by the eigenvectors, $C$ is diagonal, and this change of basis is orthogonal (i.e., it preserves distances).
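Both identities are easy to check numerically on the manatee covariance matrix (our check, not part of the original notebook):

recon = sum(l * np.outer(v, v) for l, v in zip(evals, evecs.T))
print(np.allclose(covmat, recon))                        # C = sum_i lambda_i v_i v_i^T
print(np.allclose(np.sum(covmat**2), np.sum(evals**2)))  # sum_ij C_ij^2 = sum_i lambda_i^2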

Terminology:

  • $(v_i \cdot X)$: the $i^\text{th}$ principal component, PC$i$
  • $v_i$: the loadings of PC$i$ on each variable
  • $\lambda_i^2$: the amount of variance explained by PC$i$

So, what'd we do?

Find the covariance matrix:

covmat = np.cov(X.T)

Get the eigenvalues and eigenvectors:

evals, evecs = np.linalg.eig(covmat)

Sort them by size of eigenvalue:

eord = np.argsort(evals)[::-1]
evals = evals[eord]
evecs = evecs[:,eord]

Change the data matrix into the basis given by the eigenvectors:

pcs = X.dot(evecs)

Find the percent variance explained by each PC:

perc_var = 100 * evals**2 / np.sum(evals**2)
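Those steps fit naturally into a single helper function; here's a sketch (the name do_pca is ours, not a standard API):

def do_pca(X):
    # returns (PC scores, percent variance explained, loadings),
    # following exactly the steps above
    covmat = np.cov(X.T)
    evals, evecs = np.linalg.eig(covmat)
    eord = np.argsort(evals)[::-1]          # decreasing eigenvalue order
    evals, evecs = evals[eord], evecs[:, eord]
    pcs = X.dot(evecs)                      # change into the eigenvector basis
    perc_var = 100 * evals**2 / np.sum(evals**2)
    return pcs, perc_var, evecs

pcs, perc_var, evecs = do_pca(X)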

Example: text analysis¶

In data/passages.txt we have a number of short passages from a few different books.

Can we identify the authors of each passage?

The true sources of the passages are in data/passage_sources.tsv.

Turn the data into a matrix¶

In [9]:
from collections import defaultdict

# read the passages, one per line
with open("data/passages.txt", "r") as pfile:
    passages = pfile.read().split("\n")[:-1]
sources = pd.read_table("data/passage_sources.tsv")
# all distinct words; [1:] drops the leading empty string
words = np.unique(" ".join(passages).split(" "))[1:]

def tabwords(x, words):
    # count how many times each word occurs in passage x
    d = defaultdict(int)
    for w in x.split(" "):
        d[w] += 1
    out = np.array([d[w] for w in words])
    return out

wordmat = np.array([tabwords(x, words) for x in passages])
In [10]:
passages[1]
Out[10]:
'elinor encouraged her as much as possible to talk of what she felt and before breakfast was ready they had gone through the subject again and again and with the same steady conviction and affectionate counsel on elinors side the same impetuous feelings and varying opinions on mariannes as before sometimes she could believe willoughby to be as unfortunate and as innocent as herself and at others lost every consolation in the impossibility of acquitting him at one moment she was absolutely indifferent to the observation of all the world at another she would seclude herself from it for ever and at a third could resist it with energy in one thing however she was uniform when it came to the point in avoiding where it was possible the presence of mrs jennings and in a determined silence when obliged to endure it jenningss entering into her sorrows with any compassion her kindness is not sympathy her goodnature is not tenderness all that she wants is gossip and she only likes me now because i supply it elinor had not needed this to be assured of the injustice to which her sister was often led in her opinion of others by the irritable refinement of her own mind and the too great importance placed by her on the delicacies of a strong sensibility and the graces of a polished manner like half the rest of the world if more than half there be that are clever and good marianne with excellent abilities and an excellent disposition was neither reasonable nor candid she expected from other people the same opinions and feelings as her own and she judged of their motives by the immediate effect of their actions on herself thus a circumstance occurred while the sisters were together in their own room after breakfast which sunk the heart of mrs jennings still lower in her estimation because through her own weakness it chanced to prove a source of fresh pain to herself though mrs jennings was governed in it by an impulse of the utmost goodwill with a letter in her outstretched hand and countenance gaily smiling from the persuasion of bringing comfort she entered their room saying now my dear i bring you something that i am sure will do you good in one moment her imagination placed before her a letter from willoughby full of tenderness and contrition explanatory of all that had passed satisfactory convincing and instantly followed by willoughby himself rushing eagerly into the room to inforce at her feet by the eloquence of his eyes the assurances of his letter the hand writing of her mother never till then unwelcome was before her and in the acuteness of the disappointment which followed such an ecstasy of more than hope she felt as if till that instant she had never suffered jennings no language within her reach in her moments of happiest eloquence could have expressed and now she could reproach her only by the tears which streamed from her eyes with passionate violencea reproach however so entirely lost on its object that after many expressions of pity she withdrew still referring her to the letter of comfort but the letter when she was calm enough to read it brought little comfort her mother still confident of their engagement and relying as warmly as ever on his constancy had only been roused by elinors application to intreat from marianne greater openness towards them both and this with such tenderness towards her such affection for willoughby and such a conviction of their future happiness in each other that she wept with agony through the whole of it'
In [11]:
for w, x in zip(words[:20], wordmat[:20,:].T):
    print(f"{w}: {x}")
a: [15  9 12  8  7  7  5 12  8  3 18 12 13 11  2  2 12 14 11 14]
aback: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abaft: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abandon: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abandoned: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abandonment: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abased: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abasement: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abashed: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abate: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abated: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abatement: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abating: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abbeyland: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abbreviate: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abbreviation: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abeam: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abednego: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abhor: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
abhorred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
In [12]:
wordmat.shape
Out[12]:
(1000, 19224)

This time we "do PCA" by finding the "singular value decomposition" (SVD) of the data matrix, because scipy.sparse.linalg.svds lets us find only the few PCs we're interested in: finding all of them would take waaaay too long.

In [13]:
from scipy.sparse.linalg import svds
# center and scale each passage (each row of the word-count matrix)
x = wordmat - np.mean(wordmat, axis=1)[:, np.newaxis]
x /= np.std(x, axis=1)[:, np.newaxis]
# svds returns (left singular vectors, singular values, right singular vectors)
pcs, evals, evecs = svds(x, k=3)
# sort by decreasing singular value
eord = np.argsort(evals)[::-1]
evals = evals[eord]
evecs = evecs[eord, :]
pcs = pcs[:, eord]
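Why does the SVD give the PCs? For a matrix whose columns have been centered, the right singular vectors are the eigenvectors of the sample covariance matrix, and the eigenvalues are the squared singular values divided by $n - 1$. Here's a check on toy data (ours; note that above we standardized rows rather than columns, so the correspondence is not exact there, but the SVD axes are still the directions maximizing the sum of squared projections):

A = rng.normal(size=(100, 5))
A -= A.mean(axis=0)                 # center each column
U, s, Vt = np.linalg.svd(A, full_matrices=False)
w, V = np.linalg.eigh(np.cov(A.T))  # eigh returns ascending eigenvalues
print(np.allclose(np.sort(w)[::-1], s**2 / (A.shape[0] - 1)))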

The loadings¶

In [14]:
loadings = pd.DataFrame(evecs.T, columns=[f"PC{k}" for k in range(1,4)], index=words)
loadings
Out[14]:
PC1 PC2 PC3
a -0.200405 0.037541 -0.055637
aback 0.000536 0.000239 0.000244
abaft 0.000554 0.000167 0.000109
abandon 0.000514 0.000292 -0.000012
abandoned 0.000394 0.000976 0.000132
... ... ... ...
zephyr 0.000553 0.000176 0.000067
zodiac 0.000460 0.000522 -0.000393
zone 0.000406 0.000765 -0.000035
zones 0.000505 0.000423 -0.000012
zoroaster 0.000543 0.000196 0.000136

19224 rows × 3 columns

The PCs¶

In [15]:
# PCA fig 6 & 7
fig, (ax0, ax1) = plt.subplots(1, 2)
for s in np.unique(sources['source']):
    ax0.scatter(pcs[sources['source'] == s,0], pcs[sources['source'] == s,1])
    ax1.scatter(pcs[sources['source'] == s,1], pcs[sources['source'] == s,2], label=s)

ax0.set_xlabel("PC 1"); ax0.set_ylabel("PC 2")
ax1.set_xlabel("PC 2"); ax1.set_ylabel("PC 3")
ax1.legend();
[Figure: passages plotted on PC1 against PC2 (left) and PC2 against PC3 (right), colored by source]

PC1 is length¶

In [16]:
# PCA fig 8
lengths = np.sum(wordmat, axis=1)   # total word count of each passage
for s in np.unique(sources['source']):
    plt.scatter(
        pcs[sources['source'] == s, 0],
        lengths[sources['source'] == s],
        label=s
    )
plt.legend();
plt.xlabel("PC1"); plt.ylabel("passage length");
[Figure: PC1 against passage length, colored by source]

PC2 is book¶

In [17]:
loadings.sort_values("PC2").head(20)
Out[17]:
PC1 PC2 PC3
her -0.160492 -0.355002 0.462193
to -0.335033 -0.311159 0.019167
i -0.132836 -0.256586 -0.532524
she -0.104468 -0.248379 0.239013
you -0.076411 -0.150797 -0.331023
not -0.098052 -0.131523 -0.062419
be -0.093912 -0.131226 -0.075934
was -0.141180 -0.130879 0.204603
had -0.082233 -0.094667 0.110598
my -0.047833 -0.089361 -0.194434
have -0.060876 -0.078142 -0.118018
it -0.130834 -0.071298 -0.134508
could -0.038412 -0.070246 0.058351
for -0.100339 -0.057560 -0.043023
your -0.025251 -0.054792 -0.116548
very -0.033219 -0.052090 -0.018889
elinor -0.019032 -0.050489 0.023545
he -0.101428 -0.047711 -0.022485
would -0.038035 -0.047110 -0.013667
me -0.030992 -0.046881 -0.137308
In [18]:
loadings.sort_values("PC2").tail(20)
Out[18]:
PC1 PC2 PC3
white -0.003848 0.019542 -0.005490
boat -0.003445 0.020078 -0.001117
of -0.347599 0.020880 0.089325
these -0.010246 0.021112 -0.008600
now -0.024838 0.022074 -0.015997
into -0.018128 0.025168 -0.007119
one -0.033698 0.027119 -0.037192
whales -0.005390 0.027329 -0.010196
old -0.008500 0.027787 -0.017462
his -0.120435 0.028370 0.009677
ship -0.005879 0.028600 -0.006917
sea -0.006046 0.031019 -0.006752
ahab -0.006510 0.032928 -0.007920
upon -0.016007 0.035191 -0.028701
like -0.014690 0.035441 -0.017307
a -0.200405 0.037541 -0.055637
this -0.047997 0.042451 -0.052062
in -0.196969 0.053266 -0.005499
whale -0.014152 0.069795 -0.026534
the -0.523547 0.652591 0.022868

PC3?¶

In [19]:
loadings.sort_values("PC3").head(20)
Out[19]:
PC1 PC2 PC3
i -0.132836 -0.256586 -0.532524
you -0.076411 -0.150797 -0.331023
my -0.047833 -0.089361 -0.194434
is -0.065981 0.015494 -0.170012
me -0.030992 -0.046881 -0.137308
it -0.130834 -0.071298 -0.134508
have -0.060876 -0.078142 -0.118018
your -0.025251 -0.054792 -0.116548
that -0.142322 -0.020713 -0.103028
be -0.093912 -0.131226 -0.075934
but -0.081493 -0.014145 -0.075810
will -0.025469 -0.030832 -0.075157
are -0.023219 0.004333 -0.066721
if -0.028036 -0.019286 -0.064901
not -0.098052 -0.131523 -0.062419
am -0.015088 -0.041852 -0.060722
do -0.020977 -0.035387 -0.059673
can -0.014108 -0.016091 -0.056266
a -0.200405 0.037541 -0.055637
this -0.047997 0.042451 -0.052062
In [20]:
loadings.sort_values("PC3").tail(20)
Out[20]:
PC1 PC2 PC3
mother -0.010312 -0.025910 0.021928
them -0.037627 -0.026889 0.022182
the -0.523547 0.652591 0.022868
elinor -0.019032 -0.050489 0.023545
they -0.045059 -0.018149 0.024460
every -0.022388 -0.019543 0.028844
mrs -0.012199 -0.028403 0.031719
marianne -0.013762 -0.034782 0.033626
which -0.049184 -0.029560 0.034382
by -0.064756 -0.005060 0.034632
were -0.042672 -0.005147 0.043909
herself -0.016034 -0.036806 0.055770
could -0.038412 -0.070246 0.058351
their -0.041101 -0.003621 0.065769
of -0.347599 0.020880 0.089325
and -0.331271 -0.025959 0.091769
had -0.082233 -0.094667 0.110598
was -0.141180 -0.130879 0.204603
she -0.104468 -0.248379 0.239013
her -0.160492 -0.355002 0.462193