n = 1000  # number of genes
num_samples = 20
M = rng.choice([0, 3], size=n)
D = rng.exponential(scale=2, size=n)
X = np.array([rng.normal(loc=M, scale=D, size=n) for _ in range(num_samples)])
mean_X = np.mean(X, axis=0)  # mean per gene
sd_X = np.std(X, axis=0)  # sd per gene

X.shape, mean_X.shape

((20, 1000), (1000,))

from matplotlib.collections import LineCollection
fig, ax = plt.subplots()

for Mval, col in [(0, 'black'), (3, 'red')]:
    ut = (M == Mval)
    segs = [[(d, m - 2 * s), (d, m + 2 * s)] for d, m, s in zip(D[ut], mean_X[ut], sd_X[ut])]
    ax.add_collection(LineCollection(segs, colors=col))
    ax.scatter(D[ut], mean_X[ut], c=col, label=f'true diff={Mval}')

ax.set_xlabel("true SD"); ax.set_ylabel("mean difference +/- 2 SD")
ax.legend();

import scipy.stats

tt = scipy.stats.ttest_1samp(X, axis=0, popmean=0)

fig, ax = plt.subplots()
for Mval, col in [(0, 'black'), (3, 'red')]:
    ut = (M == Mval)
    ax.scatter(mean_X[ut], tt.pvalue[ut], color=col, label=f'true diff={Mval}')
    
ax.set_xlabel("mean difference")
ax.set_ylabel("p-value")
ax.legend();

from scipy.stats import t
t_star = t.ppf(.975, df=num_samples - 2)

lower_ci = mean_X - t_star * sd_X / np.sqrt(num_samples)
upper_ci = mean_X + t_star * sd_X / np.sqrt(num_samples)
# put plot in order by observed mean diff
m_ord = np.argsort(np.argsort(mean_X))

fig, ax = plt.subplots()
ax.set_xlabel("dataset (ordered by observed difference)")
ax.set_ylabel("observed mean difference")
for Mval, col in [(0, 'black'), (3, 'red')]:
    ut = (M == Mval)
    lc = LineCollection([[(m, li), (m, ui)] for m, li, ui in zip(m_ord[ut], lower_ci[ut], upper_ci[ut])], color=col)
    ax.add_collection(lc)
    ax.scatter(m_ord[ut], mean_X[ut], c=col, label=f'true diff={Mval}')
    
ax.legend();

candidates = np.where(tt.pvalue < 0.05)[0]
true_pos = np.sum(M[candidates] > 0) / np.sum(M > 0)
false_pos = np.sum(M[candidates] == 0) / len(candidates)
print(f"Proportion of differentially expressed genes detected (power): {true_pos:.2f}")
print(f"Proportion of canddiate genes that are not differentially expressed (false positive rate): {false_pos:.2f}")

Proportion of differentially expressed genes detected (power): 0.95
Proportion of canddiate genes that are not differentially expressed (false positive rate): 0.06

Power and false positives¶

Power analysis¶

Simulate¶

Methods¶

The power analysis¶