Build a basic cube with numpy?
For the data-structure you could define the following class:
class Cube:
    """Array wrapper whose cells are addressed by (row label, column label).

    Labels are translated to integer positions, so ``cube[row, col]``
    reads ``data[row_position, col_position]``.
    """

    def __init__(self, row_index, col_index, data):
        # A label's position is its place in the sequence it was given in;
        # a repeated label keeps the position of its last occurrence.
        self.row_index = self._positions(row_index)
        self.col_index = self._positions(col_index)
        self.data = data

    @staticmethod
    def _positions(labels):
        """Return a ``{label: position}`` lookup for *labels*."""
        lookup = {}
        for position, label in enumerate(labels):
            lookup[label] = position
        return lookup

    def __getitem__(self, item):
        row_label, col_label = item
        row_pos = self.row_index[row_label]
        col_pos = self.col_index[col_label]
        return self.data[row_pos, col_pos]

    def __repr__(self):
        # Display is delegated entirely to the wrapped data.
        return repr(self.data)
Basically a light wrapper around a two-dimensional numpy array. For computing the cross tabulation you could do something like this:
def _x_tab(rows, columns, values):
    """Function for computing the cross-tab of simple arrays.

    Args:
        rows: one label per observation; selects the output row.
        columns: one label per observation; selects the output column.
        values: one number per observation, summed into the matching cell.

    Returns:
        A pair ``((unique_rows, unique_columns), table)``. ``table`` is a
        float array with one row per unique row label and one column per
        unique column label, both in the order ``np.unique`` returns them.
        Cells with no observation stay at 0.
    """
    # For each axis: the distinct labels and, per observation, the position
    # of its label among them.
    unique_values_all_cols, idx = zip(
        *(np.unique(col, return_inverse=True) for col in (rows, columns))
    )
    shape_xt = [uniq_vals_col.size for uniq_vals_col in unique_values_all_cols]
    # Builtin ``float`` instead of the ``np.float`` alias, which was removed
    # in NumPy 1.24 and raises AttributeError there.
    xt = np.zeros(shape_xt, dtype=float)
    # ``ufunc.at`` applies the operation once per index entry, so repeated
    # (row, column) pairs all accumulate into the same cell.
    np.add.at(xt, idx, values)
    return unique_values_all_cols, xt
def make_index(a, r):
    """Make array of tuples: one tuple per row of *a*, limited to columns *r*."""
    selected = a[:, r]
    return make_object_array(list(map(tuple, selected)))
def make_object_array(l):
    """Return a one-dimensional object array holding the items of *l*."""
    boxed = np.empty(shape=len(l), dtype=object)
    # Filling a pre-allocated object array by slice assignment stores each
    # item of ``l`` in its own cell (make_index relies on this to get an
    # "array of tuples").
    boxed[:] = l
    return boxed
def fill_label(ar, le):
    """Pad every label tuple in *ar* to exactly *le* entries with "ALL"."""
    padding = ("ALL",) * le
    filled = []
    for label in ar:
        # Appending ``le`` fillers always gives enough entries; the slice
        # then trims the result back to ``le``.
        padded = label + padding
        filled.append(padded[:le])
    return filled
def x_tab(rows, cols, values):
    """Main function for cross tabulation.

    Args:
        rows: two-dimensional array of row labels, one row per observation
            and one column per hierarchy level (its ``shape`` is unpacked
            into two values).
        cols: one column label per observation.
        values: one number per observation.

    Returns:
        A ``Cube``. Its row keys are tuples with one entry per level of
        ``rows``; levels that were aggregated away hold ``"ALL"``. Its column
        keys are the distinct values of ``cols`` in sorted order, followed by
        ``"ALL"`` for the per-row total column (omitted if ``"ALL"`` is
        already one of the column values).
    """
    _, l_cols = rows.shape
    total_agg = []
    total_idx = []
    col_labels = []
    # One cross-tab per prefix of the row hierarchy: no levels (grand total),
    # then the first level, then the first two, and so on.
    for i in range(l_cols + 1):
        (idx, uniq_cols), agg = _x_tab(make_index(rows, list(range(i))), cols, values)
        total_idx.extend(fill_label(idx, l_cols))
        total_agg.append(agg)
        # The aggregated columns follow the order of the unique column values
        # returned by _x_tab (sorted by np.unique), so the labels must come
        # from there. Taking them in first-appearance order from ``cols``
        # mislabels columns whenever ``cols`` is not already sorted.
        col_labels = uniq_cols.tolist()
    stacked_agg = np.vstack(total_agg)
    stacked_agg_total = stacked_agg.sum(axis=1).reshape(-1, 1)
    # Give the appended totals column a key so it can be read through the
    # Cube, unless that key would shadow a real column value.
    if "ALL" not in col_labels:
        col_labels.append("ALL")
    return Cube(total_idx, col_labels, np.concatenate((stacked_agg, stacked_agg_total), axis=1))
Suppose the input is an array `arr`:
[['Shakespeare' 'Hamlet' 2000 104.2]
['Shakespeare' 'Hamlet' 2001 99.0]
['Shakespeare' 'Romeo' 2000 27.0]
['Shakespeare' 'Romeo' 2001 19.0]
['Dante' 'Inferno' 2000 11.6]
['Dante' 'Inferno' 2001 12.6]]
Then `x_tab` can be called like this:
# Columns 0 and 1 of arr supply the row labels, column 2 the column labels,
# and column 3 the values to aggregate.
result = x_tab(arr[:, [0, 1]], arr[:, 2], arr[:, 3])
print(result)
Output
array([[142.8, 130.6, 273.4],
[ 11.6, 12.6, 24.2],
[131.2, 118. , 249.2],
[ 11.6, 12.6, 24.2],
[104.2, 99. , 203.2],
[ 27. , 19. , 46. ]])
Note that this representation (repr) is just for the purpose of showing the results; you can change it as you see fit. Then you can access the cells of the cube as follows:
# Each key is (row label tuple, column label); "ALL" in the row tuple marks a
# level that was aggregated away.
print(result[('Dante', 'ALL'), 2001])
print(result[('Dante', 'Inferno'), 2001])
print(result[('Shakespeare', 'Hamlet'), 2000])
Output
12.6
12.6
104.2
Notice that the bulk of the operations are in the `_x_tab` function, which uses pure numpy functions. At the same time, it provides a flexible interface for any aggregation function you choose; just change the ufunc at this line:
np.add.at(xt, idx, values)
by any other ufunc from NumPy's list of universal functions. For more information, see the documentation on the `ufunc.at` method.
A working copy of the code can be found here. The above is based on this gist.
Note: this assumes you are passing multiple columns for the index (the `rows` parameter), i.e. `rows` must be a two-dimensional array.
I think numpy record arrays can be used for this task. Below is my solution based on record arrays.
class rec_array():
    """Sales records kept in a numpy record array, with filtered totals.

    Each record has the fields ``author``, ``book`` and ``year`` (text of up
    to 20 characters) and ``sales`` (float).
    """

    def __init__(self, author=None, book=None, year=None, sales=None):
        self.dtype = [('author', '<U20'), ('book', '<U20'), ('year', '<U20'), ('sales', float)]
        # A zero-dimensional result (shape ``()``) is what add_record and
        # get_view treat as "no records yet".
        self.rec_array = np.rec.fromarrays((author, book, year, sales), dtype=self.dtype)

    def add_record(self, author, book, year, sales):
        """Append records; each argument holds one value per new record."""
        new_rec = np.rec.fromarrays((author, book, year, sales), dtype=self.dtype)
        if self.rec_array.shape == ():
            # Nothing stored yet: the new records become the whole table.
            self.rec_array = new_rec
        else:
            self.rec_array = np.hstack((self.rec_array, new_rec))

    def get_view(self, conditions):
        """Return the sum of ``sales`` over the records matching all conditions.

        conditions:
            A list of ``[field, operator, target]`` triples, for example
            [["author","<","Shakespeare"],["year","<=","2000"]]
            Supported operators: ``==``, ``!=``, ``<``, ``<=``, ``>``, ``>=``.
            ``target`` is converted to the field's type before comparing, so
            text fields compare as strings and ``sales`` compares as a number.

        Returns 0.0 when no records have been added.

        Raises:
            ValueError: if an operator is not one of the supported ones.
        """
        import operator

        # Explicit dispatch instead of eval() on a built-up source string:
        # targets containing quotes no longer break the expression, and
        # caller-supplied text is never executed as code.
        comparators = {
            "==": operator.eq,
            "!=": operator.ne,
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
        }
        if self.rec_array.shape == ():
            return 0.0
        mask = np.ones(self.rec_array.shape[0], dtype=bool)
        for field, op, target in conditions:
            if op not in comparators:
                raise ValueError("unsupported operator: %r" % (op,))
            column = self.rec_array[field]
            # Coerce the target to the column's scalar type (e.g. 2000 ->
            # "2000" for the text ``year`` field, "100" -> 100.0 for sales).
            mask &= comparators[op](column, column.dtype.type(target))
        return np.sum(self.rec_array['sales'][mask])
Based on this `rec_array` class, given the data
# Six sample records: position i across the four lists describes one record.
author = ["Shakespeare"] * 4 + ["Dante"] * 2
book = ["Hamlet"] * 2 + ["Romeo"] * 2 + ["Inferno"] * 2
year = ["2000", "2001"] * 3
sales = [104.2, 99.0, 27.0, 19.0, 11.6, 12.6]
we create an instance
# Start with no data (every constructor argument defaults to None), then load
# all sample records in a single call.
test = rec_array()
test.add_record(author,book,year,sales)
If, for example, you want the sales of Shakespeare's Romeo, you can simply do this
# Sum of sales over records where author == "Shakespeare" and book == "Romeo".
test.get_view([["author","==","Shakespeare"],["book","==","Romeo"]])
the output is 46.0
or, you can also do
# The year field is stored as text ('<U20'), so "<=" is a string comparison.
test.get_view([["author","==","Shakespeare"],["year","<=","2000"]])
the output is 131.2