Python built-in sum function vs. for loop performance
However, if the loop is just summing consecutive integers starting from 0 (each value one larger than the last), you can skip the iteration entirely and use the closed-form arithmetic-series formula. The sum should be 499999500000 for range(1000000).
import timeit
def sum1(n=1000000):
    """Sum the integers 0..n-1 with an explicit Python-level for loop.

    This is the interpreted-bytecode baseline of the benchmark, so it is
    deliberately written as a manual loop rather than with the built-in
    sum().

    n -- exclusive upper bound of the range to add up.  Defaults to
         1000000, for which the result is 499999500000.

    Returns the total as an integer (0 when n <= 0).
    """
    total = 0
    for i in range(n):
        total += i
    return total
def sum2(n=1000000):
    """Sum the integers 0..n-1 using the built-in sum().

    Same result as the explicit loop variant, but the iteration happens
    inside the built-in instead of in Python bytecode.

    n -- exclusive upper bound of the range to add up.  Defaults to
         1000000, for which the result is 499999500000.

    Returns the total as an integer (0 when n <= 0).
    """
    return sum(range(n))
def sum3(n=1000000):
    """Sum the integers 0..n-1 in constant time via the closed form.

    The terms 0, 1, ..., n-1 form an arithmetic series whose total is
    n * (n - 1) / 2.  One of n and n - 1 is always even, so the floor
    division below is exact and the result stays an integer on both
    Python 2 and Python 3.

    Unlike averaging the end points of a materialised range, this does
    not truncate for odd sums, does not build a sequence in memory, and
    does not fail for ranges with fewer than two elements.

    n -- exclusive upper bound of the range to add up.  Defaults to
         1000000, for which the result is 499999500000.

    Returns the total as an integer (0 when n <= 0, matching an empty
    range).
    """
    if n <= 0:
        # range(n) is empty for non-positive n, so the sum is 0.
        return 0
    return n * (n - 1) // 2
# Time 10 calls of each implementation and report the elapsed seconds.
# A single pre-formatted argument is passed to print so the line parses and
# prints the same text under both the Python 2 print statement and the
# Python 3 print function.
print('For Loop Sum: %s' % timeit.timeit(sum1, number=10))
print('Built-in Sum: %s' % timeit.timeit(sum2, number=10))
print('Fast Sum: %s' % timeit.timeit(sum3, number=10))
#prints
#For Loop Sum: 1.8420711
#Built-in Sum: 1.1081646
#Fast Sum: 0.3191561
You can see the source code in `Python/bltinmodule.c`. It has special cases for `int`s and `float`s, but since the sum overflows to `long`s pretty quickly, that probably doesn't have a major performance impact here. The general-case logic is pretty similar to what you'd write in Python, just in C. The speedup is most likely due to the fact that it doesn't have to go through all the bytecode-interpreting and error-handling overhead:
/* Implementation of the built-in sum(): adds up the items of an iterable.
   args carries one or two positional arguments: the iterable (seq) and an
   optional start value (result), which defaults to the int 0 when omitted.
   Returns the total as a new object, or NULL on failure. */
static PyObject*
builtin_sum(PyObject *self, PyObject *args)
{
PyObject *seq;
PyObject *result = NULL;
PyObject *temp, *item, *iter;
/* Accept between 1 and 2 arguments; result stays NULL if no start is given. */
if (!PyArg_UnpackTuple(args, "sum", 1, 2, &seq, &result))
return NULL;
iter = PyObject_GetIter(seq);
if (iter == NULL)
return NULL;
if (result == NULL) {
/* No start value supplied: begin from a fresh int 0. */
result = PyInt_FromLong(0);
if (result == NULL) {
Py_DECREF(iter);
return NULL;
}
} else {
/* reject string values for 'start' parameter */
if (PyObject_TypeCheck(result, &PyBaseString_Type)) {
PyErr_SetString(PyExc_TypeError,
"sum() can't sum strings [use ''.join(seq) instead]");
Py_DECREF(iter);
return NULL;
}
/* Take our own reference to the caller's start value so that the
   Py_DECREF(result) calls below are balanced in both branches. */
Py_INCREF(result);
}
#ifndef SLOW_SUM
/* Fast addition by keeping temporary sums in C instead of new Python objects.
Assumes all inputs are the same type. If the assumption fails, default
to the more general routine.
*/
if (PyInt_CheckExact(result)) {
/* Int fast path: the running total lives in the C long i_result.
   result is set to NULL to mean "still on the fast path"; the loop
   below keeps going only while that is true. */
long i_result = PyInt_AS_LONG(result);
Py_DECREF(result);
result = NULL;
while(result == NULL) {
item = PyIter_Next(iter);
if (item == NULL) {
/* Iterator finished or raised: either way we are done here. */
Py_DECREF(iter);
if (PyErr_Occurred())
return NULL;
return PyInt_FromLong(i_result);
}
if (PyInt_CheckExact(item)) {
long b = PyInt_AS_LONG(item);
long x = i_result + b;
/* Accept the C sum only if x has the same sign as at least one
   operand; otherwise fall through to the overflow handling below. */
if ((x^i_result) >= 0 || (x^b) >= 0) {
i_result = x;
Py_DECREF(item);
continue;
}
}
/* Either overflowed or is not an int. Restore real objects and process normally */
/* NOTE(review): the PyInt_FromLong() result is passed to PyNumber_Add()
   and Py_DECREF() without a NULL check - confirm whether allocation
   failure needs handling here. */
result = PyInt_FromLong(i_result);
temp = PyNumber_Add(result, item);
Py_DECREF(result);
Py_DECREF(item);
result = temp;
if (result == NULL) {
Py_DECREF(iter);
return NULL;
}
/* result is now a real object, so the while condition fails and
   control continues with the float check and the general loop. */
}
}
if (PyFloat_CheckExact(result)) {
/* Float fast path: same scheme as above, with the running total held
   in the C double f_result and result == NULL as the "fast" marker. */
double f_result = PyFloat_AS_DOUBLE(result);
Py_DECREF(result);
result = NULL;
while(result == NULL) {
item = PyIter_Next(iter);
if (item == NULL) {
Py_DECREF(iter);
if (PyErr_Occurred())
return NULL;
return PyFloat_FromDouble(f_result);
}
if (PyFloat_CheckExact(item)) {
/* The second macro argument releases item and iter before returning;
   presumably it runs when a floating-point exception is trapped -
   TODO confirm against pyfpe.h. */
PyFPE_START_PROTECT("add", Py_DECREF(item); Py_DECREF(iter); return 0)
f_result += PyFloat_AS_DOUBLE(item);
PyFPE_END_PROTECT(f_result)
Py_DECREF(item);
continue;
}
if (PyInt_CheckExact(item)) {
/* Exact ints are folded into the double total by converting them. */
PyFPE_START_PROTECT("add", Py_DECREF(item); Py_DECREF(iter); return 0)
f_result += (double)PyInt_AS_LONG(item);
PyFPE_END_PROTECT(f_result)
Py_DECREF(item);
continue;
}
/* Item is neither an exact float nor an exact int: rebuild a float
   object from the C total and add generically, leaving the fast path. */
/* NOTE(review): the PyFloat_FromDouble() result is used without a NULL
   check - confirm whether allocation failure needs handling here. */
result = PyFloat_FromDouble(f_result);
temp = PyNumber_Add(result, item);
Py_DECREF(result);
Py_DECREF(item);
result = temp;
if (result == NULL) {
Py_DECREF(iter);
return NULL;
}
}
}
#endif
/* General routine: add the remaining items one at a time with
   PyNumber_Add, replacing result with each new total. */
for(;;) {
item = PyIter_Next(iter);
if (item == NULL) {
/* error, or end-of-sequence */
if (PyErr_Occurred()) {
Py_DECREF(result);
result = NULL;
}
break;
}
/* It's tempting to use PyNumber_InPlaceAdd instead of
PyNumber_Add here, to avoid quadratic running time
when doing 'sum(list_of_lists, [])'. However, this
would produce a change in behaviour: a snippet like
empty = []
sum([[x] for x in range(10)], empty)
would change the value of empty. */
temp = PyNumber_Add(result, item);
Py_DECREF(result);
Py_DECREF(item);
result = temp;
if (result == NULL)
break;
}
/* Reached on normal completion and on error alike; result is NULL in
   the error case. */
Py_DECREF(iter);
return result;
}
As dwanderson suggested, NumPy is one alternative, and a good one if you want to do numerical work. See this benchmark:
import numpy as np
# Build and sum one million integers twice: once with plain Python objects
# and once with numpy.  The trailing comments are the author's measured
# timings for each statement.
r = range(1000000)         # 12.5 ms
s = sum(r)                 # 7.9 ms
ar = np.arange(1000000)    # 0.5 ms
# Named ar_sum because "as" is a reserved keyword and cannot be assigned to.
ar_sum = np.sum(ar)        # 0.6 ms
So both creating the sequence and summing it are much faster with `numpy`. This is mostly because the `numpy.array` is designed for this and is much more efficient than the list.
However, if we already have a Python list, then `numpy` is very slow, as its conversion from a list into a `numpy.array` is sluggish:
# Start from a plain Python sequence and convert it into a numpy array.
# The trailing comment is the author's measured time for the conversion.
r = range(1000000)
ar = np.array(r) # 102 ms
The speed difference is actually greater than 3 times, but you slow down either version by first creating a huge in-memory list of 1 million integers. Separate that out of the time trials:
>>> import timeit
>>> def sum1(lst):
...     s = 0
...     for i in lst:
...         s += i
...     return s
...
>>> def sum2(lst):
...     return sum(lst)
...
>>> values = range(1000000)
>>> timeit.timeit('f(lst)', 'from __main__ import sum1 as f, values as lst', number=100)
3.457869052886963
>>> timeit.timeit('f(lst)', 'from __main__ import sum2 as f, values as lst', number=100)
0.6696369647979736
The speed difference has risen to over 5 times now.
A `for` loop is executed as interpreted Python bytecode. `sum()` loops entirely in C code. The speed difference between interpreted bytecode and C code is large.
In addition, the C code makes sure not to create new Python objects if it can keep the sum in C types instead; this works for `int` and `float` results.
The Python version, disassembled, does this:
>>> import dis
>>> def sum1():
...     s = 0
...     for i in range(1000000):
...         s += i
...     return s
...
>>> dis.dis(sum1)
2 0 LOAD_CONST 1 (0)
3 STORE_FAST 0 (s)
3 6 SETUP_LOOP 30 (to 39)
9 LOAD_GLOBAL 0 (range)
12 LOAD_CONST 2 (1000000)
15 CALL_FUNCTION 1
18 GET_ITER
>> 19 FOR_ITER 16 (to 38)
22 STORE_FAST 1 (i)
4 25 LOAD_FAST 0 (s)
28 LOAD_FAST 1 (i)
31 INPLACE_ADD
32 STORE_FAST 0 (s)
35 JUMP_ABSOLUTE 19
>> 38 POP_BLOCK
5 >> 39 LOAD_FAST 0 (s)
42 RETURN_VALUE
Apart from the interpreter loop being slower than C, the `INPLACE_ADD` will create a new integer object on every iteration once the total passes 256 (CPython caches the small `int` objects from -5 to 256 as singletons).
You can see the C implementation in the Python mercurial code repository, but it explicitly states in the comments:
/* Fast addition by keeping temporary sums in C instead of new Python objects.
Assumes all inputs are the same type. If the assumption fails, default
to the more general routine.
*/