Group by max or min in a numpy array
I'm fairly new to Python and Numpy but, it seems like you can use the .at
method of ufunc
s rather than reduceat
:
import numpy as np
data_id = np.array([0,0,0,1,1,1,1,2,2,2,3,3,3,4,5,5,5])
data_val = np.random.rand(len(data_id))
ans = np.empty(data_id[-1]+1) # might want to use max(data_id) and zeros instead
np.maximum.at(ans,data_id,data_val)
For example:
data_val = array([ 0.65753453, 0.84279716, 0.88189818, 0.18987882, 0.49800668,
0.29656994, 0.39542769, 0.43155428, 0.77982853, 0.44955868,
0.22080219, 0.4807312 , 0.9288989 , 0.10956681, 0.73215416,
0.33184318, 0.10936647])
ans = array([ 0.98969952, 0.84044947, 0.63460516, 0.92042078, 0.75738113,
0.37976055])
Of course this only makes sense if your data_id
values are suitable for use as indices (i.e. non-negative integers and not huge...presumably if they are large/sparse you could initialize ans
using np.unique(data_id)
or something).
I should point out that the data_id
doesn't actually need to be sorted.
In pure Python:
from itertools import groupby, imap, izip
from operator import itemgetter as ig
print [max(imap(ig(1), g)) for k, g in groupby(izip(id, data), key=ig(0))]
# -> [7, 10, 1]
A variation:
print [data[id==i].max() for i, _ in groupby(id)]
# -> [7, 10, 1]
Based on @Bago's answer:
import numpy as np
# sort by `id` then by `data`
ndx = np.lexsort(keys=(data, id))
id, data = id[ndx], data[ndx]
# get max()
print data[np.r_[np.diff(id), True].astype(np.bool)]
# -> [ 7 10 1]
If pandas
is installed:
from pandas import DataFrame
df = DataFrame(dict(id=id, data=data))
print df.groupby('id')['data'].max()
# id
# 1 7
# 2 10
# 3 1
I've been seeing some very similar questions on stack overflow the last few days. The following code is very similar to the implementation of numpy.unique and because it takes advantage of the underlying numpy machinery, it is most likely going to be faster than anything you can do in a python loop.
import numpy as np
def group_min(groups, data):
# sort with major key groups, minor key data
order = np.lexsort((data, groups))
groups = groups[order] # this is only needed if groups is unsorted
data = data[order]
# construct an index which marks borders between groups
index = np.empty(len(groups), 'bool')
index[0] = True
index[1:] = groups[1:] != groups[:-1]
return data[index]
#max is very similar
def group_max(groups, data):
order = np.lexsort((data, groups))
groups = groups[order] #this is only needed if groups is unsorted
data = data[order]
index = np.empty(len(groups), 'bool')
index[-1] = True
index[:-1] = groups[1:] != groups[:-1]
return data[index]