Take the sum of every N rows in a pandas series

The most efficient solution I can think of is f1() in my example below. It is orders of magnitude faster than using the groupby in the other answer. Note that f1() doesn't work when the length of the array is not an exact multiple, e.g. if you want to sum a 3-item array every 2 items. For those cases, you can use f1v2():

f1v2( [0,1,2,3,4] ,2 ) = [1,5,4]

My code is below. I have used timeit for the comparisons:

import timeit
import numpy as np
import pandas as pd


def f1(a,x):
    if isinstance(a, pd.Series):
        a = a.to_numpy()
    return a.reshape((int(a.shape[0]/x), int(x) )).sum(1)

def f2(myarray, x):
  return [sum(myarray[n: n+x]) for n in range(0, len(myarray), x)]

def f3(myarray, x):
    s = pd.Series(myarray)
    out = s.groupby(s.index // 2).sum()
    return out

def f1v2(a,x):
    if isinstance(a, pd.Series):
        a = a.to_numpy()
        
    mod = a.shape[0] % x
    if  mod != 0:
        excl = a[-mod:]
        keep = a[: len(a) - mod]
        out = keep.reshape((int(keep.shape[0]/x), int(x) )).sum(1)
        out = np.hstack( (excl.sum() , out) ) 
    else:       
        out = a.reshape((int(a.shape[0]/x), int(x) )).sum(1)
    
    return out
    

a = np.arange(0,1e6)

out1 = f1(a,2)
out2 = f2(a,2)
out3 = f2(a,2)

t1 = timeit.Timer( "f1(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t1v2 = timeit.Timer( "f1v2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t2 = timeit.Timer( "f2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t3 = timeit.Timer( "f3(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)

resdf = pd.DataFrame(index = ['min time'])
resdf['f1'] = [min(t1)]
resdf['f1v2'] = [min(t1v2)]
resdf['f2'] = [min(t2)]
resdf['f3'] = [min(t3)]
#the docs explain why it makes more sense to take the min than the avg
resdf = resdf.transpose()
resdf['% difference vs fastes'] = (resdf /resdf.min() - 1) * 100

b = np.array( [0,1,2,4,5,6,7] )

out1v2 = f1v2(b,2)

GroupBy.sum

N = 5
s.groupby(s.index // N).sum()
     
0     10
1     35
2     60
3     85
4    110
5    135
6    160
7    185
8    210
9    235
dtype: int64

Chunk the index into groups of 5 and group accordingly.


numpy.reshape + sum

If the size is a multiple of N (or 5), you can reshape and add:

s.values.reshape(-1, N).sum(1)
# array([ 10,  35,  60,  85, 110, 135, 160, 185, 210, 235])

numpy.add.at

b = np.zeros(len(s) // N)
np.add.at(b, s.index // N, s.values)
b
# array([ 10.,  35.,  60.,  85., 110., 135., 160., 185., 210., 235.])

Tags:

Python

Pandas