How to get a list of all indices of repeated elements in a numpy array
I've found that not using np.unique, and instead using np.diff, is significantly faster and handles non-sorted initial arrays much better.
To show this, I ran @Trenton McKinney's benchmark for a couple of the trial sizes (2 million and 20k elements); the diff solution outperforms the others by a wide margin. It also does not require a sorted array or sorting the array, which is a significant benefit.
Here is the function:
import numpy as np

def find_repeats(arr: np.ndarray) -> np.ndarray:
    """Find indices of repeat values in an array.

    Args:
        arr (np.ndarray): An array to find repeat values in.

    Returns:
        np.ndarray: An array of indices into arr which are the values which
            repeat.
    """
    # Append a sentinel that differs from the last element so arr_diff
    # has the same length as arr.
    arr_diff = np.diff(arr, append=[arr[-1] + 1])
    # A zero difference marks the left element of each pair of equal neighbors.
    res_mask = arr_diff == 0
    # Shift the mask right by one so the right element of each pair is marked too.
    arr_diff_zero_right = np.nonzero(res_mask)[0] + 1
    res_mask[arr_diff_zero_right] = True
    return np.nonzero(res_mask)[0]
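A quick usage example, continuing from the function above (the sample array is my own, chosen so the result is easy to verify by hand):

arr = np.array([1, 1, 1, 2, 2, 3, 3, 3, 4])
print(find_repeats(arr))
# [0 1 2 3 4 5 6 7] -- every index except 8, since the 4 occurs only once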
2 Million Elements (benchmark plot)
20k Elements (benchmark plot)
Full Test Code
import random
import time
from collections import defaultdict

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd


def find_repeats(arr: np.ndarray) -> np.ndarray:
    """Find indices of repeat values in an array.

    Args:
        arr (np.ndarray): An array to find repeat values in.

    Returns:
        np.ndarray: An array of indices into arr which are the values which
            repeat.
    """
    arr_diff = np.diff(arr, append=[arr[-1] + 1])
    res_mask = arr_diff == 0
    arr_diff_zero_right = np.nonzero(res_mask)[0] + 1
    res_mask[arr_diff_zero_right] = True
    return np.nonzero(res_mask)[0]


def dd(l):
    # default_dict test
    indices = defaultdict(list)
    for i, v in enumerate(l):
        indices[v].append(i)
    return indices


def npw(l):
    # np_where test (defined for reference; not timed in this run)
    return {v: np.where(l == v)[0] for v in np.unique(l)}


def uni(records_array):
    # np_unique test
    idx_sort = np.argsort(records_array)
    sorted_records_array = records_array[idx_sort]
    vals, idx_start, count = np.unique(
        sorted_records_array, return_counts=True, return_index=True)
    res = np.split(idx_sort, idx_start[1:])
    return dict(zip(vals, res))


def daf(l):
    # pandas test
    return pd.DataFrame(l).groupby([0]).indices


data = defaultdict(list)

for x in range(4, 20000, 1000):  # number of unique elements
    print(f"{x} trial done")

    # create a 2M element array
    random.seed(365)
    a = np.array([random.choice(range(x)) for _ in range(2000000)])

    num_runs = 2

    t0 = time.time()
    for i in range(num_runs):
        dd(a)
    res1 = time.time() - t0

    t0 = time.time()
    for i in range(num_runs):
        uni(a)
    res3 = time.time() - t0

    t0 = time.time()
    for i in range(num_runs):
        daf(a)
    res4 = time.time() - t0

    t0 = time.time()
    for i in range(num_runs):
        find_repeats(a)
    res5 = time.time() - t0

    data['default_dict'].append(res1 / num_runs)
    data['np_unique'].append(res3 / num_runs)
    data['pandas'].append(res4 / num_runs)
    data['np_diff'].append(res5 / num_runs)
    data['idx'].append(x)

df = pd.DataFrame(data)
df.set_index('idx', inplace=True)
df.plot(figsize=(12, 5), xlabel='unique samples',
        ylabel='average time (s)', title='2 runs, 1 loop each')
plt.legend(bbox_to_anchor=(1.0, 1), loc='upper left')
plt.show()
A vectorized solution with numpy, based on the magic of unique().
import numpy as np
# create a test array
records_array = np.array([1, 2, 3, 1, 1, 3, 4, 3, 2])
# creates an array of indices, sorted by unique element
idx_sort = np.argsort(records_array)
# sorts records array so all unique elements are together
sorted_records_array = records_array[idx_sort]
# returns the unique values, the index of the first occurrence of a value, and the count for each element
vals, idx_start, count = np.unique(sorted_records_array, return_counts=True, return_index=True)
# splits the indices into separate arrays
res = np.split(idx_sort, idx_start[1:])
# filter them with respect to their size, keeping only items occurring more than once
# (a list comprehension rather than filter(), which is lazy in Python 3)
vals = vals[count > 1]
res = [group for group in res if group.size > 1]
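Continuing with the test array, printing the results shows the repeated values and their index groups (the index order inside each group can vary, since np.argsort is not stable by default):

print(vals)  # [1 2 3] -- the values that occur more than once
print(res)   # [array([0, 3, 4]), array([1, 8]), array([2, 5, 7])]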
The following code was the original answer; it requires a bit more memory, because it uses numpy broadcasting and calls unique twice:
import numpy as np

records_array = np.array([1, 2, 3, 1, 1, 3, 4, 3, 2])
vals, inverse, count = np.unique(records_array, return_inverse=True,
                                 return_counts=True)

# indices (into vals) of the values that occur more than once
idx_vals_repeated = np.where(count > 1)[0]
vals_repeated = vals[idx_vals_repeated]

# broadcast: for each repeated value, find the positions where it occurs
rows, cols = np.where(inverse == idx_vals_repeated[:, np.newaxis])
_, inverse_rows = np.unique(rows, return_index=True)
res = np.split(cols, inverse_rows[1:])
which gives, as expected, res = [array([0, 3, 4]), array([1, 8]), array([2, 5, 7])].
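The extra memory comes from the broadcast comparison: inverse (length n) compared against idx_vals_repeated[:, np.newaxis] materializes a boolean matrix of shape (number of repeated values, n) before np.where reduces it. A small sketch with the example's actual intermediates:

inverse = np.array([0, 1, 2, 0, 0, 2, 3, 2, 1])  # from unique() on records_array
idx_vals_repeated = np.array([0, 1, 2])          # vals 1, 2 and 3 repeat

mask = inverse == idx_vals_repeated[:, np.newaxis]
print(mask.shape)  # (3, 9): one row per repeated value, one column per element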
- The answer is complicated, and depends on the size of the array and the number of unique elements in it.
- The following:
  - Tests arrays with 2M elements and up to 20k unique elements.
  - Tests arrays up to 80k elements, with a max of 20k unique elements.
  - For arrays under 40k elements, the tests have up to half as many unique elements as the array has elements (e.g. a 10k-element array would have up to 5k unique elements).
Arrays with 2M Elements
- np.where is faster than defaultdict for up to about 200 unique elements, but slower than pandas.core.groupby.GroupBy.indices and np.unique.
- The solution using pandas is the fastest for large arrays.
Arrays with up to 80k Elements
- This is more situational, depending on the size of the array and the number of unique elements.
- defaultdict is a fast option for arrays up to about 2400 elements, especially with a large number of unique elements.
- For arrays larger than 40k elements with 20k unique elements, pandas is the fastest option (see the sketch below).
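For reference, this is the shape of the pandas result: groupby(...).indices maps each value to an array of the positions where it occurs. A minimal sketch on the small example array from the other answer:

import numpy as np
import pandas as pd

a = np.array([1, 2, 3, 1, 1, 3, 4, 3, 2])
print(pd.DataFrame(a).groupby(0).indices)
# {1: array([0, 3, 4]), 2: array([1, 8]), 3: array([2, 5, 7]), 4: array([6])}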
%timeit Test Code
# run in IPython/Jupyter: the %timeit magic below is not plain Python
import random
from collections import defaultdict

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd


def dd(l):
    # default_dict test
    indices = defaultdict(list)
    for i, v in enumerate(l):
        indices[v].append(i)
    return indices


def npw(l):
    # np_where test
    return {v: np.where(l == v)[0] for v in np.unique(l)}


def uni(records_array):
    # np_unique test
    idx_sort = np.argsort(records_array)
    sorted_records_array = records_array[idx_sort]
    vals, idx_start, count = np.unique(
        sorted_records_array, return_counts=True, return_index=True)
    res = np.split(idx_sort, idx_start[1:])
    return dict(zip(vals, res))


def daf(l):
    # pandas test
    return pd.DataFrame(l).groupby([0]).indices


data = defaultdict(list)

for x in range(4, 20000, 100):  # number of unique elements
    # create a 2M element array
    random.seed(365)
    a = np.array([random.choice(range(x)) for _ in range(2000000)])

    res1 = %timeit -r2 -n1 -q -o dd(a)
    res2 = %timeit -r2 -n1 -q -o npw(a)
    res3 = %timeit -r2 -n1 -q -o uni(a)
    res4 = %timeit -r2 -n1 -q -o daf(a)

    data['default_dict'].append(res1.average)
    data['np_where'].append(res2.average)
    data['np_unique'].append(res3.average)
    data['pandas'].append(res4.average)
    data['idx'].append(x)

df = pd.DataFrame(data)
df.set_index('idx', inplace=True)
df.plot(figsize=(12, 5), xlabel='unique samples', ylabel='average time (s)',
        title='%timeit test: 2 runs, 1 loop each')
plt.legend(bbox_to_anchor=(1.0, 1), loc='upper left')
plt.show()