Co-occurrence matrix from nested list of words
You can also use a matrix trick to compute the co-occurrence matrix. This approach scales well when you have a larger vocabulary.
import scipy.sparse as sp
# Build a sparse document-term count matrix X: one row per document, one
# column per vocabulary word, X[r, c] = occurrences of word c in document r.

# Map each vocabulary word to its column index.
voc2id = dict(zip(names, range(len(names))))

# Collect one (row, col, 1) triple per in-vocabulary word occurrence; words
# that are not in `names` are skipped.
rows, cols, vals = [], [], []
for r, d in enumerate(document):
    for e in d:
        c = voc2id.get(e)  # single lookup instead of .get() followed by []
        if c is not None:
            rows.append(r)
            cols.append(c)
            vals.append(1)

# Give the shape explicitly. Without it SciPy infers the size from the largest
# index present, so vocabulary words that never occur would be dropped from
# the columns (and there is nothing to infer from when no word matches).
# Repeated (row, col) pairs are summed on conversion to CSR.
X = sp.csr_matrix((vals, (rows, cols)), shape=(len(document), len(names)))
Now you can find the co-occurrence matrix by simply multiplying X.T with X:
# Word-by-word co-occurrence matrix: the transposed document-term matrix
# times the document-term matrix itself.
Xc = X.T.dot(X)

# The diagonal pairs each word with itself, so zero it out.
Xc.setdiag(0)

print(Xc.toarray())
Obviously this can be extended for your purposes, but it performs the general operation in mind:
import math
# For every ordered pair of words (a, b) print how often they co-occur
# across the documents in `document`.
for a in 'ABCD':
    for b in 'ABCD':
        count = 0
        for x in document:
            if a != b:
                # Distinct words: one co-occurrence per document that
                # contains both of them.
                if a in x and b in x:
                    count += 1
            else:
                # Same word: the number of unordered pairs among its n
                # occurrences, n * (n - 1) / 2. Integer arithmetic keeps the
                # count an int; the expression is 0 when n < 2.
                n = x.count(a)
                count += n * (n - 1) // 2
        # print() as a function so the snippet runs on Python 3.
        print('{} x {} = {}'.format(a, b, count))
from collections import OrderedDict
document = [
    ['A', 'B'],
    ['C', 'B'],
    ['A', 'B', 'C', 'D'],
]
names = ['A', 'B', 'C', 'D']

# Square table of counters keyed by name on both axes, all starting at zero.
occurrences = OrderedDict()
for row_name in names:
    occurrences[row_name] = OrderedDict((col_name, 0) for col_name in names)

# Tally the co-occurrences: each word is paired with every other position
# of the same document.
for words in document:
    for pos, word in enumerate(words):
        neighbours = words[:pos] + words[pos + 1:]
        for other in neighbours:
            occurrences[word][other] += 1

# Print the matrix: a header row of names, then one row per name.
header = ' '.join(occurrences.keys())
print(' ', header)
for name, values in occurrences.items():
    row = ' '.join(str(i) for i in values.values())
    print(name, row)
Output:
A B C D
A 0 2 1 1
B 2 0 2 1
C 1 2 0 1
D 1 1 1 0
Another option is to use the constructor
csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
from scipy.sparse, where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
The trick is to generate row_ind and col_ind by iterating over the documents and creating a list of tuples (doc_id, word_id). data would simply be a vector of ones of the same length.
Multiplying the transpose of the docs-words matrix by the docs-words matrix itself gives you the co-occurrence matrix.
Additionally, this is efficient in terms of both run time and memory usage, so it should also handle big corpora.
import numpy as np
import itertools
from scipy.sparse import csr_matrix
def create_co_occurences_matrix(allowed_words, documents, *, verbose=True):
    """Build a sparse word-by-word co-occurrence matrix.

    Each document becomes one row of a sparse document-by-word count matrix;
    the transpose of that matrix times the matrix itself is the
    co-occurrence matrix, whose diagonal is then cleared.

    Args:
        allowed_words: Sequence of words to count. The position of a word in
            this sequence is its row/column index in the result.
        documents: Iterable of documents, each an iterable of words. Words
            that are not in ``allowed_words`` are ignored.
        verbose: When true (the default), print the inputs and the dense
            result. Pass ``verbose=False`` for large inputs, since printing
            densifies the matrix.

    Returns:
        A ``(words_cooc_matrix, word_to_id)`` tuple: the sparse
        ``len(allowed_words)`` x ``len(allowed_words)`` matrix with dtype
        ``uint32``, and the word -> index mapping used to build it.
    """
    if verbose:
        print(f"allowed_words:\n{allowed_words}")
        print(f"documents:\n{documents}")

    word_to_id = dict(zip(allowed_words, range(len(allowed_words))))

    # One (doc_id, word_id) pair per allowed word occurrence. Explicit lists
    # (rather than zip(*...)) keep this working when no allowed word occurs.
    row_ind, col_ind = [], []
    n_docs = 0
    for doc_id, doc in enumerate(documents):
        n_docs = doc_id + 1
        for word in doc:
            word_id = word_to_id.get(word)
            if word_id is not None:
                row_ind.append(doc_id)
                col_ind.append(word_id)

    # use unsigned int for better memory utilization
    data = np.ones(len(row_ind), dtype='uint32')

    # Size the columns by the vocabulary, not by the largest id seen, so the
    # result always lines up with word_to_id even when trailing words never
    # occur in any document.
    docs_words_matrix = csr_matrix(
        (
            data,
            (
                np.asarray(row_ind, dtype=np.intp),
                np.asarray(col_ind, dtype=np.intp),
            ),
        ),
        shape=(n_docs, len(allowed_words)),
    )

    # Transposed matrix times the matrix itself yields the co-occurrences.
    words_cooc_matrix = docs_words_matrix.T @ docs_words_matrix
    words_cooc_matrix.setdiag(0)
    # setdiag(0) leaves explicitly stored zeros behind; drop them.
    words_cooc_matrix.eliminate_zeros()

    if verbose:
        print(f"words_cooc_matrix:\n{words_cooc_matrix.todense()}")
    return words_cooc_matrix, word_to_id
Run example:
# Example input: 'K' and 'Z' appear in the documents but not in the
# vocabulary passed as allowed_words.
allowed_words = ['A', 'B', 'C', 'D']
documents = [
    ['A', 'B'],
    ['C', 'B', 'K'],
    ['A', 'B', 'C', 'D', 'Z'],
]
words_cooc_matrix, word_to_id = create_co_occurences_matrix(
    allowed_words,
    documents,
)
Output:
allowed_words:
['A', 'B', 'C', 'D']
documents:
[['A', 'B'], ['C', 'B', 'K'], ['A', 'B', 'C', 'D', 'Z']]
words_cooc_matrix:
[[0 2 1 1]
[2 0 2 1]
[1 2 0 1]
[1 1 1 0]]