Pandas: Drop consecutive duplicates
Here is a function that handles both pd.Series and pd.DataFrame objects. You can mask or drop, choose the axis, and finally choose whether to drop with 'any' or 'all' 'NaN'. It is not optimized in terms of computation time, but it has the advantage of being robust and pretty clear.
import numpy as np
import pandas as pd
# To mask/drop successive values in pandas
def Mask_Or_Drop_Successive_Identical_Values(df, drop=False,
                                             keep_first=True,
                                             axis=0, how='all'):
    '''
    Mask (set to NaN) or drop values that repeat on successive rows.

    Function built with the help of:
    1) https://stackoverflow.com/questions/48428173/how-to-change-consecutive-repeating-values-in-pandas-dataframe-series-to-nan-or
    2) https://stackoverflow.com/questions/19463985/pandas-drop-consecutive-duplicates

    Input:
        df: a pandas.DataFrame or a pandas.Series. Each value is compared
            with the value one row before it (and, when keep_first is
            False, also with the value one row after it).
        drop: if False (default), repeated values are only masked, i.e.
            replaced by NaN. If True, the masked result is then passed
            through dropna.
        keep_first: if True (default), the first value of each run of
            identical values is kept. If False, every value of the run
            is masked, including the first one.
        axis, how: forwarded unchanged to dropna; only used when drop is
            True.
    Output:
        A new DataFrame or Series with the repeated values masked or
        dropped.

    Note: dropping is done with dropna on the masked data, so values
    that were already NaN in the input are treated by dropna exactly
    like masked ones.
    '''
    # True wherever a value equals the value on the previous row.
    repeats_previous = df.shift(1) == df

    if keep_first:
        # Mask keeping the first occurrence of each run.
        to_mask = repeats_previous
    else:
        # Mask including the first occurrence: a value is also masked
        # when the next row holds the same value.
        to_mask = repeats_previous | (df.shift(-1) == df)

    masked = df.mask(to_mask)

    if drop:
        # Drop the values (e.g. rows are deleted).
        return masked.dropna(axis=axis, how=how)

    # Only mask the values (e.g. they become 'NaN').
    return masked
Here is some test code to include in the script:
if __name__ == "__main__":
    # Demo: prints the result of every mask/drop combination, first on a
    # Series, then on a DataFrame.

    # With time series
    print("With time series:\n")
    # Runs of identical values plus isolated and consecutive NaN entries.
    ts = pd.Series([1, 1, 2, 2, 3, 2, 6, 6, float('nan'), 6, 6,
                    float('nan'), float('nan')],
                   index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    print("#Original ts:")
    print(ts)

    print("\n## 1) Mask keeping the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(ts, drop=False,
                                                   keep_first=True))
    print("\n## 2) Mask including the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(ts, drop=False,
                                                   keep_first=False))
    print("\n## 3) Drop keeping the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(ts, drop=True,
                                                   keep_first=True))
    print("\n## 4) Drop including the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(ts, drop=True,
                                                   keep_first=False))

    # With dataframes
    print("With dataframe:\n")
    # The random values are not seeded, so they differ between runs; only
    # the constant runs written below are reproducible.
    df = pd.DataFrame(np.random.randn(15, 3))
    # Overlapping runs of constant values, one per column.
    df.iloc[4:9, 0] = 40
    df.iloc[8:15, 1] = 22
    df.iloc[8:12, 2] = 0.23
    print("#Original df:")
    print(df)

    print("\n## 5) Mask keeping the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=False,
                                                   keep_first=True))
    print("\n## 6) Mask including the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=False,
                                                   keep_first=False))
    print("\n## 7) Drop 'any' keeping the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=True,
                                                   keep_first=True,
                                                   how='any'))
    print("\n## 8) Drop 'all' keeping the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=True,
                                                   keep_first=True,
                                                   how='all'))
    print("\n## 9) Drop 'any' including the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=True,
                                                   keep_first=False,
                                                   how='any'))
    print("\n## 10) Drop 'all' including the first occurrence:")
    print(Mask_Or_Drop_Successive_Identical_Values(df, drop=True,
                                                   keep_first=False,
                                                   how='all'))
And here is the expected result:
With time series:
#Original ts:
0 1.0
1 1.0
2 2.0
3 2.0
4 3.0
5 2.0
6 6.0
7 6.0
8 NaN
9 6.0
10 6.0
11 NaN
12 NaN
dtype: float64
## 1) Mask keeping the first occurrence:
0 1.0
1 NaN
2 2.0
3 NaN
4 3.0
5 2.0
6 6.0
7 NaN
8 NaN
9 6.0
10 NaN
11 NaN
12 NaN
dtype: float64
## 2) Mask including the first occurrence:
0 NaN
1 NaN
2 NaN
3 NaN
4 3.0
5 2.0
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
dtype: float64
## 3) Drop keeping the first occurrence:
0 1.0
2 2.0
4 3.0
5 2.0
6 6.0
9 6.0
dtype: float64
## 4) Drop including the first occurrence:
4 3.0
5 2.0
dtype: float64
With dataframe:
#Original df:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 40.000000 1.889781 -1.394573
5 40.000000 -0.470958 -0.339213
6 40.000000 1.613524 0.271641
7 40.000000 -1.810958 -1.568372
8 40.000000 22.000000 0.230000
9 -0.296557 22.000000 0.230000
10 -0.921238 22.000000 0.230000
11 -0.170195 22.000000 0.230000
12 1.460457 22.000000 -0.295418
13 0.307825 22.000000 -0.759131
14 0.287392 22.000000 0.378315
## 5) Mask keeping the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 40.000000 1.889781 -1.394573
5 NaN -0.470958 -0.339213
6 NaN 1.613524 0.271641
7 NaN -1.810958 -1.568372
8 NaN 22.000000 0.230000
9 -0.296557 NaN NaN
10 -0.921238 NaN NaN
11 -0.170195 NaN NaN
12 1.460457 NaN -0.295418
13 0.307825 NaN -0.759131
14 0.287392 NaN 0.378315
## 6) Mask including the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 NaN 1.889781 -1.394573
5 NaN -0.470958 -0.339213
6 NaN 1.613524 0.271641
7 NaN -1.810958 -1.568372
8 NaN NaN NaN
9 -0.296557 NaN NaN
10 -0.921238 NaN NaN
11 -0.170195 NaN NaN
12 1.460457 NaN -0.295418
13 0.307825 NaN -0.759131
14 0.287392 NaN 0.378315
## 7) Drop 'any' keeping the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 40.000000 1.889781 -1.394573
## 8) Drop 'all' keeping the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 40.000000 1.889781 -1.394573
5 NaN -0.470958 -0.339213
6 NaN 1.613524 0.271641
7 NaN -1.810958 -1.568372
8 NaN 22.000000 0.230000
9 -0.296557 NaN NaN
10 -0.921238 NaN NaN
11 -0.170195 NaN NaN
12 1.460457 NaN -0.295418
13 0.307825 NaN -0.759131
14 0.287392 NaN 0.378315
## 9) Drop 'any' including the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
## 10) Drop 'all' including the first occurrence:
0 1 2
0 -1.890137 -3.125224 -1.029065
1 -0.224712 -0.194742 1.891365
2 1.009388 0.589445 0.927405
3 0.212746 -0.392314 -0.781851
4 NaN 1.889781 -1.394573
5 NaN -0.470958 -0.339213
6 NaN 1.613524 0.271641
7 NaN -1.810958 -1.568372
9 -0.296557 NaN NaN
10 -0.921238 NaN NaN
11 -0.170195 NaN NaN
12 1.460457 NaN -0.295418
13 0.307825 NaN -0.759131
14 0.287392 NaN 0.378315
Since we are going for the most efficient way, i.e. performance, let's use the array data to leverage NumPy. We will take one-off slices and compare them, similar to the shifting method discussed earlier in @EdChum's post. But with NumPy slicing we would end up with an array that is one element shorter, so we need to concatenate a True element at the start to select the first element; hence we would have an implementation like so -
def drop_consecutive_duplicates(a):
    """Return the Series *a* without consecutive duplicate values.

    The first element of every run of equal values is kept, together
    with its index label. The comparison is done on the underlying
    NumPy array (``a.values``) rather than through ``shift``.

    Parameters
    ----------
    a : pandas.Series
        Input series.

    Returns
    -------
    pandas.Series
        The elements of *a* that differ from their predecessor, plus the
        first element. An empty input yields an empty result.
    """
    ar = a.values
    if ar.size == 0:
        # Without this guard the mask below would have length 1 for a
        # length-0 series, and boolean indexing would raise.
        return a.iloc[:0]
    # The first element has no predecessor and is always kept; every
    # other element is kept when it differs from the one before it.
    keep = np.concatenate(([True], ar[:-1] != ar[1:]))
    return a[keep]
Sample run -
In [149]: a
Out[149]:
1 1
2 2
3 2
4 3
5 2
dtype: int64
In [150]: drop_consecutive_duplicates(a)
Out[150]:
1 1
2 2
4 3
5 2
dtype: int64
Timings on large arrays, comparing against @EdChum's solution -
In [142]: a = pd.Series(np.random.randint(1,5,(1000000)))
In [143]: %timeit a.loc[a.shift() != a]
100 loops, best of 3: 12.1 ms per loop
In [144]: %timeit drop_consecutive_duplicates(a)
100 loops, best of 3: 11 ms per loop
In [145]: a = pd.Series(np.random.randint(1,5,(10000000)))
In [146]: %timeit a.loc[a.shift() != a]
10 loops, best of 3: 136 ms per loop
In [147]: %timeit drop_consecutive_duplicates(a)
10 loops, best of 3: 114 ms per loop
So, there's some improvement!
Get major boost for values only!
If only the values are needed, we could get major boost by simply indexing into the array data, like so -
def drop_consecutive_duplicates(a):
    """Return the values of *a* without consecutive duplicates.

    Values-only variant: the result is indexed out of ``a.values``, so
    the index labels of *a* are not part of the result.

    Parameters
    ----------
    a : pandas.Series
        Input series.

    Returns
    -------
    numpy.ndarray
        The values of *a* that differ from their predecessor, plus the
        first value. An empty input yields an empty array.
    """
    ar = a.values
    if ar.size == 0:
        # Without this guard the mask below would have length 1 for a
        # length-0 array, and boolean indexing would raise.
        return ar[:0]
    # The first element has no predecessor and is always kept; every
    # other element is kept when it differs from the one before it.
    keep = np.concatenate(([True], ar[:-1] != ar[1:]))
    return ar[keep]
Sample run -
In [170]: a = pandas.Series([1,2,2,3,2], index=[1,2,3,4,5])
In [171]: drop_consecutive_duplicates(a)
Out[171]: array([1, 2, 3, 2])
Timings -
In [173]: a = pd.Series(np.random.randint(1,5,(10000000)))
In [174]: %timeit a.loc[a.shift() != a]
10 loops, best of 3: 137 ms per loop
In [175]: %timeit drop_consecutive_duplicates(a)
10 loops, best of 3: 61.3 ms per loop
Use shift:
a.loc[a.shift(-1) != a]
Out[3]:
1 1
3 2
4 3
5 2
dtype: int64
So the above uses boolean criteria: we compare the dataframe against the dataframe shifted by -1 rows to create the mask.
Another method is to use diff:
In [82]:
a.loc[a.diff() != 0]
Out[82]:
1 1
2 2
4 3
5 2
dtype: int64
But this is slower than the original method if you have a large number of rows.
Update
Thanks to Bjarke Ebert for pointing out a subtle error: I should actually use shift(1), or just shift() as the default is a period of 1. This returns the first consecutive value:
In [87]:
a.loc[a.shift() != a]
Out[87]:
1 1
2 2
4 3
5 2
dtype: int64
Note the difference in index values, thanks @BjarkeEbert!
Here is an update that will make it work with multiple columns. Use ".any(axis=1)" to combine the results from each column:
# Columns whose values are compared between consecutive rows.
cols = ["col1","col2","col3"]
# Keep a row when at least one of the selected columns differs from the
# row just before it; the result contains only the columns in `cols`.
de_dup = a[cols].loc[(a[cols].shift() != a[cols]).any(axis=1)]