How to find the correlation between a group of values in a pandas dataframe column
To get your desired output format you could use .corrwith
:
corrs = (df[['Var1', 'ID']]
.groupby('ID')
.corrwith(df.Var2)
.rename(columns={'Var1' : 'Corr_Coef'}))
print(corrs)
Corr_Coef
ID
1 0.98198
2 0.97073
Generalized solution:
import numpy as np
def groupby_coef(df, col1, col2, on_index=True, squeeze=True, name='coef',
keys=None, **kwargs):
"""Grouped correlation coefficient between two columns
Flat result structure in contrast to `groupby.corr()`.
Parameters
==========
df : DataFrame
col1 & col2: str
Columns for which to calculate correlation coefs
on_index : bool, default True
Specify whether you're grouping on index
squeeze : bool, default True
True -> Series; False -> DataFrame
name : str, default 'coef'
Name of DataFrame column if squeeze == True
keys : column label or list of column labels / arrays
Passed to `pd.DataFrame.set_index`
**kwargs :
Passed to `pd.DataFrame.groupby`
"""
# If we are grouping on something other than the index, then
# set as index first to avoid hierarchical result.
# Kludgy, but safer than trying to infer.
if not on_index:
df = df.set_index(keys=keys)
if not kwargs:
# Assume we're grouping on 0th level of index
kwargs = {'level': 0}
grouped = df[[col1]].groupby(**kwargs)
res = grouped.corrwith(df[col2])
res.columns = [name]
if squeeze:
res = np.squeeze(res)
return res
Examples:
df_1 = pd.DataFrame(np.random.randn(10, 2),
index=[1]*5 + [2]*5).add_prefix('var')
df_2 = df_1.reset_index().rename(columns={'index': 'var2'})
print(groupby_coef(df_1, 'var0', 'var1', level=0))
1 7.424e-18
2 -9.481e-19
Name: coef, dtype: float64
print(groupby_coef(df_2, col1='var0', col2='var1',
on_index=False, keys='var2'))
var2
1 7.424e-18
2 -9.481e-19
Name: coef, dtype: float64
df.groupby('ID').corr()
Output:
Var1 Var2
ID
1 Var1 1.000000 0.981981
Var2 0.981981 1.000000
2 Var1 1.000000 0.970725
Var2 0.970725 1.000000
With OP output formating.
df_out = df.groupby('ID').corr()
(df_out[~df_out['Var1'].eq(1)]
.reset_index(1, drop=True)['Var1']
.rename('Corr_Coef')
.reset_index())
Output:
ID Corr_Coef
0 1 0.981981
1 2 0.970725