How to get the p-value between two groups after groupby in pandas?
You can do this:
import numpy as np
import pandas as pd
import scipy.stats as stats
def get_ttest(x, y, sided=1):
    """Return the Welch t-test p-value for two independent samples.

    The test is run with ``equal_var=False`` (Welch's t-test, no
    equal-variance assumption) and the resulting two-sided p-value is
    divided by ``sided``.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare.
    sided : int or float, default 1
        Divisor applied to the two-sided p-value.  The default of 1
        returns the two-sided p-value unchanged; 2 halves it.

    Returns
    -------
    float
        ``pvalue / sided``; NaN when the test is undefined (e.g. a sample
        with a single observation).
    """
    # NOTE(review): despite its name, sided=1 yields the *two-sided* p-value
    # and sided=2 the halved one.  Halving is a valid one-sided p-value only
    # when the observed effect lies in the hypothesised direction -- confirm
    # the intended semantics before relying on sided=2.
    two_sided_pvalue = stats.ttest_ind(x, y, equal_var=False).pvalue
    return two_sided_pvalue / sided
# Build a small reproducible demo frame: one row per user with the country,
# the experiment arm (0/1), the binary outcome and the sex.
np.random.seed(100)
N = 15
df = pd.DataFrame({
    'country': np.random.choice(['A', 'B', 'C'], N),
    'test': np.random.choice([0, 1], N),
    'conversion': np.random.choice([0, 1], N),
    'sex': np.random.choice(['M', 'F'], N),
})

col_groupby = 'country'
col_test_control = 'test'
col_effect = 'conversion'

# unique() returns labels in order of first appearance, whereas unstack()
# below lays its columns out in sorted label order.  Sorting here keeps
# `a`/`b` aligned with the column layout regardless of the row order.
a, b = sorted(df[col_test_control].unique())

# Per (country, arm): sample size and mean outcome, with the arm moved into
# the columns -> columns are (stat, arm) pairs.
df_pval = (
    df.groupby([col_groupby, col_test_control])[col_effect]
    .agg(['size', 'mean'])
    .unstack(col_test_control)
)
# Derive the flat names from the actual (stat, arm) columns instead of
# hard-coding their order, so a label can never be attached to the wrong arm.
df_pval.columns = [f'group{arm}_{stat}' for stat, arm in df_pval.columns]

# Per country: Welch t-test between the two arms.  Only the two columns the
# lambda reads are passed on, so the grouping column is not part of the
# frame handed to apply().
df_pval['pvalue'] = (
    df.groupby(col_groupby)[[col_test_control, col_effect]]
    .apply(lambda dfx: get_ttest(
        dfx.loc[dfx[col_test_control] == a, col_effect],
        dfx.loc[dfx[col_test_control] == b, col_effect]))
)
print(df_pval)
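Result (the column headers shown below were renamed for readability; the code above names them `group<label>_size` and `group<label>_mean`)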
test_size control_size test_mean control_mean pvalue
country
A 3 3 0.666667 0.666667 1.000000
B 1 1 1.000000 1.000000 NaN
C 4 3 0.750000 1.000000 0.391002
Test the result
# Cross-check the table above: recompute the p-value for country C by hand.
country_c = df[df.country == 'C']
c0 = country_c.loc[country_c.test == 0, 'conversion']
c1 = country_c.loc[country_c.test == 1, 'conversion']
welch = stats.ttest_ind(c0, c1, equal_var=False)
pval = welch.pvalue
print(pval)  # 0.39100221895577053
Alternatively, `pivot` could be used to reshape the data into one column per test group before running the t-test.
def f(group):
    """Return the Welch t-test p-value between the two arms of ``group``.

    ``group`` must have a ``'test'`` column holding the labels 0 and 1 and a
    ``'conversion'`` column holding the outcome.

    The frame is pivoted so that each ``'test'`` label becomes its own
    column of ``'conversion'`` values.  Rows keep their original index, so
    each column is NaN wherever the row belongs to the other arm;
    ``nan_policy='omit'`` makes the t-test ignore those fillers.
    """
    pvt_table = group.pivot(columns='test', values='conversion')
    result = stats.ttest_ind(pvt_table[0], pvt_table[1],
                             equal_var=False, nan_policy='omit')
    return result.pvalue
# Select the two needed columns with a *list*: indexing a GroupBy with a
# bare tuple of keys ("['test','conversion']") was deprecated in pandas 1.0
# and raises in pandas 2.0.
grouped = df.groupby(['country'])[['test', 'conversion']]
grouped.apply(f)
#country
#A 1
#B --
#C 0.391002
#dtype: object