Take the sum of every N rows in a pandas series
The most efficient solution I can think of is f1()
in my example below. It is orders of magnitude faster than using the groupby
in the other answer.
Note that f1() doesn't work when the length of the array is not an exact multiple, e.g. if you want to sum a 3-item array every 2 items.
For those cases, you can use f1v2():
f1v2( [0,1,2,3,4] ,2 ) = [1,5,4]
My code is below. I have used timeit
for the comparisons:
import timeit
import numpy as np
import pandas as pd
def f1(a,x):
if isinstance(a, pd.Series):
a = a.to_numpy()
return a.reshape((int(a.shape[0]/x), int(x) )).sum(1)
def f2(myarray, x):
return [sum(myarray[n: n+x]) for n in range(0, len(myarray), x)]
def f3(myarray, x):
s = pd.Series(myarray)
out = s.groupby(s.index // 2).sum()
return out
def f1v2(a,x):
if isinstance(a, pd.Series):
a = a.to_numpy()
mod = a.shape[0] % x
if mod != 0:
excl = a[-mod:]
keep = a[: len(a) - mod]
out = keep.reshape((int(keep.shape[0]/x), int(x) )).sum(1)
out = np.hstack( (excl.sum() , out) )
else:
out = a.reshape((int(a.shape[0]/x), int(x) )).sum(1)
return out
a = np.arange(0,1e6)
out1 = f1(a,2)
out2 = f2(a,2)
out3 = f2(a,2)
t1 = timeit.Timer( "f1(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t1v2 = timeit.Timer( "f1v2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t2 = timeit.Timer( "f2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t3 = timeit.Timer( "f3(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
resdf = pd.DataFrame(index = ['min time'])
resdf['f1'] = [min(t1)]
resdf['f1v2'] = [min(t1v2)]
resdf['f2'] = [min(t2)]
resdf['f3'] = [min(t3)]
#the docs explain why it makes more sense to take the min than the avg
resdf = resdf.transpose()
resdf['% difference vs fastes'] = (resdf /resdf.min() - 1) * 100
b = np.array( [0,1,2,4,5,6,7] )
out1v2 = f1v2(b,2)
GroupBy.sum
N = 5
s.groupby(s.index // N).sum()
0 10
1 35
2 60
3 85
4 110
5 135
6 160
7 185
8 210
9 235
dtype: int64
Chunk the index into groups of 5 and group accordingly.
numpy.reshape
+ sum
If the size is a multiple of N (or 5), you can reshape and add:
s.values.reshape(-1, N).sum(1)
# array([ 10, 35, 60, 85, 110, 135, 160, 185, 210, 235])
numpy.add.at
b = np.zeros(len(s) // N)
np.add.at(b, s.index // N, s.values)
b
# array([ 10., 35., 60., 85., 110., 135., 160., 185., 210., 235.])