from collections import Counter, defaultdict
# Load the passages, one per line.  The context manager closes the file
# handle as soon as the text has been read instead of leaving it open.
with open("data/passages.txt", "r") as pfile:
    # The final split element is discarded: when the file ends with a
    # newline, that element is the empty string after the last "\n".
    passages = pfile.read().split("\n")[:-1]

# Table parsed from the tab-separated passage-sources file.
sources = pd.read_table("data/passage_sources.tsv")

# Sorted array of the distinct space-separated tokens over all passages.
# The first sorted entry is skipped -- presumably the empty string produced
# by consecutive spaces; TODO confirm against the data.
words = np.unique(" ".join(passages).split(" "))[1:]
def tabwords(x, words):
    """Count how often each vocabulary word occurs in a passage.

    Args:
        x: Passage text; it is tokenised by splitting on single spaces.
        words: Iterable of vocabulary words that fixes the order of the
            returned counts.

    Returns:
        A NumPy array with one entry per element of ``words`` holding the
        number of times that word appears among the tokens of ``x``
        (0 when it does not appear).
    """
    # Counter returns 0 for absent keys without inserting them, so looking
    # up the whole vocabulary does not grow the mapping.
    counts = Counter(x.split(" "))
    return np.array([counts[w] for w in words])
# Count matrix: one row per passage, one column per entry of `words`.
wordmat = np.array([tabwords(passage, words) for passage in passages])


