Length of each string in a NumPy array
You can use NumPy's `vectorize`. It is much faster.
# Wrap the built-in len so that it is applied to every element of an array.
mylen = np.vectorize(len)
# print() call form: valid on Python 3 and, with a single argument, on Python 2.
print(mylen(arr))
UPDATE 06/20: Cater for the U+0000 character and non-contiguous inputs - thanks @M1L0U
Here is a comparison of a couple of methods.
Observations:
- For input size >1000 lines, viewcasting + `argmax` is consistently and by a large margin the fastest.
- Python solutions profit from converting the array to a list first.
- `map` beats list comprehension.
- `np.frompyfunc` and, to a lesser degree, `np.vectorize` fare better than their reputation.
contiguous
method ↓↓ size →→ | 10| 100| 1000| 10000| 100000|1000000
------------------------------------+-------+-------+-------+-------+-------+-------
np.char.str_len | 0.006| 0.037| 0.350| 3.566| 34.781|345.803
list comprehension | 0.005| 0.036| 0.312| 2.970| 28.783|293.715
list comprehension after .tolist() | 0.002| 0.011| 0.117| 1.119| 12.863|133.886
map | 0.002| 0.008| 0.080| 0.745| 9.374|103.749
np.frompyfunc | 0.004| 0.011| 0.089| 0.861| 8.824| 88.739
np.vectorize | 0.025| 0.032| 0.132| 1.046| 12.112|133.863
safe argmax | 0.026| 0.026| 0.056| 0.290| 2.827| 32.583
non-contiguous
method ↓↓ size →→ | 10| 100| 1000| 10000| 100000|1000000
------------------------------------+-------+-------+-------+-------+-------+-------
np.char.str_len | 0.006| 0.037| 0.349| 3.575| 34.525|344.859
list comprehension | 0.005| 0.032| 0.306| 2.963| 29.445|292.527
list comprehension after .tolist() | 0.002| 0.011| 0.117| 1.043| 11.081|130.644
map | 0.002| 0.008| 0.081| 0.731| 7.967| 99.848
np.frompyfunc | 0.005| 0.012| 0.099| 0.885| 9.221| 92.700
np.vectorize | 0.025| 0.033| 0.146| 1.063| 11.844|134.505
safe argmax | 0.026| 0.026| 0.057| 0.291| 2.997| 31.161
Code:
import numpy as np
# Registry of (label, function) pairs; filled by the timeme decorator below.
flist = []
def timeme(name):
    """Return a decorator that records the decorated function under *name*.

    The decorator appends ``(name, function)`` to the module-level ``flist``
    and hands the function back unchanged.
    """
    def register(func):
        entry = (name, func)
        flist.append(entry)
        return func

    return register
@timeme("np.char.str_len")
def np_char():
    """Benchmark candidate: lengths of the global array ``A`` via ``np.char.str_len``."""
    lengths = np.char.str_len(A)
    return lengths
@timeme("list comprehension")
def lst_cmp():
    """Benchmark candidate: ``len`` in a list comprehension iterating ``A`` directly."""
    lengths = [len(item) for item in A]
    return lengths
@timeme("list comprehension after .tolist()")
def lst_cmp_opt():
    """Benchmark candidate: list comprehension over ``A`` converted with ``.tolist()``."""
    items = A.tolist()
    return [len(item) for item in items]
@timeme("map")
def map_():
    """Benchmark candidate: ``map(len, ...)`` over ``A`` converted with ``.tolist()``."""
    items = A.tolist()
    lengths = map(len, items)
    return list(lengths)
@timeme("np.frompyfunc")
def np_fpf():
    """Benchmark candidate: ``len`` wrapped by ``np.frompyfunc`` (1 input, 1 output)."""
    # The wrapper is built inside the call so its construction is timed too.
    elementwise_len = np.frompyfunc(len, 1, 1)
    return elementwise_len(A)
@timeme("np.vectorize")
def np_vect():
    """Benchmark candidate: ``len`` wrapped by ``np.vectorize`` and applied to ``A``."""
    # The wrapper is built inside the call so its construction is timed too.
    elementwise_len = np.vectorize(len)
    return elementwise_len(A)
@timeme("safe argmax")
def np_safe():
    """Return the length of every string in the global unicode array ``A``.

    Each element is reinterpreted as a row of 4-byte code units.  A string's
    length is the row width minus the number of trailing zero units, so an
    embedded U+0000 is counted while trailing zero units are not.  ``A`` is
    addressed through ``A.strides[0]`` only, i.e. it is treated as 1-D; it
    may be non-contiguous.

    Returns:
        Integer array with one length per element of ``A`` (empty when ``A``
        is empty).

    Raises:
        AssertionError: if ``A`` is not of unicode ("U") dtype.
    """
    # Explicit raise instead of a bare ``assert`` so the check survives
    # ``python -O``; the exception type is unchanged.
    if A.dtype.kind != "U":
        raise AssertionError("np_safe requires an array of unicode ('U') dtype")
    # ``A[0, None]`` below raises IndexError on an empty array.
    if A.size == 0:
        return np.zeros(0, dtype=np.intp)
    width = A.itemsize // 4  # code units per element (4 bytes each)
    # work around numpy's refusal to viewcast non contiguous arrays
    v = np.lib.stride_tricks.as_strided(
        A[0, None].view("u4"), (A.size, width), (A.strides[0], 4))
    # Reverse each row so argmax finds the LAST non-zero unit of the string.
    v = v[:, ::-1].astype(bool)
    l = v.argmax(1)
    # argmax is also 0 for an all-False row; such rows have a zero final unit.
    empty = (~(v[:, 0] | l.astype(bool))).nonzero()
    l = width - l
    l[empty] = 0
    return l
# Sanity check before timing: on a strided sample of 500000 random words,
# every registered method must agree with the last one registered.
# The source sentence ends in a space, so split(" ") also yields an empty
# string, and "Blind\x00text" carries an embedded U+0000.
A = np.random.choice(
    "Blind\x00text do not use the quick brown fox jumps over the lazy dog "
    .split(" "), 1000000)[::2]
if flist:
    # The reference result is loop-invariant: compute it once, not per method.
    reference = flist[-1][1]()
    for _, f in flist[:-1]:
        assert (f() == reference).all()
from timeit import timeit
# Time every registered method on contiguous (step 1) and strided (step 2)
# inputs of growing size and print one table per memory layout.
for step, layout in ((1, "contiguous"), (2, "non-contiguous")):
    print('\n', layout)
    # One separator character per printed row: '|' for the header row,
    # '+' for the dashed rule, then '|' for each method row.
    separators = '|+' + '|' * len(flist)
    label_column = [f"{'method ↓↓ size →→':36s}", '-' * 36]
    label_column += [f"{name:36s}" for name, _ in flist]
    columns = [separators, label_column]
    for size in (10, 100, 1000, 10000, 100000, 1000000):
        # Draw step*size words and keep every step-th one, so A always has
        # `size` elements; the benchmark functions read the global A.
        A = np.random.choice(("Blind\x00text do not use the quick brown fox"
                              " jumps over the lazy dog ").split(" "),
                             step * size)[::step]
        # timeit returns the total seconds for 10 calls; the value is
        # printed multiplied by 100.
        timing_column = [f"{size:>7d}", '-' * 7]
        timing_column += [f"{timeit(func, number=10)*100:7.3f}"
                          for _, func in flist]
        columns.append(timing_column)
    # Transpose the columns into rows; each row leads with its separator.
    for sep, *cells in zip(*columns):
        print(*cells, sep=sep)