How to parse a pandas column of JSON content efficiently?
I see a small (~25%) performance improvement from bypassing `pandas.concat`.
Beyond that, rewriting or optimizing `json_normalize` doesn't seem straightforward.
def original(df):
    """Baseline parser: normalize every JSON string and stitch the pieces with ``pd.concat``.

    Args:
        df: Frame with a ``data`` column of JSON strings and a ``bank_account``
            column.

    Returns:
        A frame holding the flattened JSON fields plus ``bank_account``,
        indexed by the parsed ``uid`` column (which is also kept as a column).
    """
    decoded = map(json.loads, df['data'])
    pieces = [json_normalize(record) for record in decoded]
    result = pd.concat(pieces)
    # .values copies positionally, bypassing index alignment with the
    # concatenated frame's index.
    result['bank_account'] = df['bank_account'].values
    result.index = result['uid']
    return result
def jp(df):
    """Parse the JSON strings in ``df['data']`` without going through ``pd.concat``.

    Each JSON document is flattened with ``json_normalize`` and its first row
    is collected as a ``{column label: value}`` mapping; a single
    ``pd.DataFrame`` is then built from all mappings at once.

    Values are matched to the expected columns by label, not by position, so
    the result does not depend on the column order ``json_normalize`` happens
    to produce for a given record. Expected columns absent from a record are
    filled with NaN, and any other flattened keys are ignored.

    Args:
        df: Frame with a ``data`` column of JSON strings and a ``bank_account``
            column.

    Returns:
        A frame with the six expected flattened columns plus ``bank_account``,
        indexed by the parsed ``uid`` column (which is also kept as a column).
    """
    cols = ['account_data.currency.current', 'account_data.currency.minimum',
            'account_data.fees.monthly', 'account_type', 'uid', 'user_name']
    records = []
    for js in df['data']:
        flat = json_normalize(json.loads(js))
        # Keep each value attached to the label json_normalize gave it;
        # labelling the raw value array with `cols` would mislabel data
        # whenever the flattened column order differs from `cols`.
        records.append(dict(zip(flat.columns, flat.values[0])))
    # `columns=cols` selects (and orders) the expected fields by name.
    parsed_df = pd.DataFrame(records, columns=cols)
    # .values copies positionally, bypassing index alignment.
    parsed_df['bank_account'] = df['bank_account'].values
    parsed_df.index = parsed_df['uid']
    return parsed_df
# Benchmark setup: repeat the existing `df` 100 times (with a fresh
# 0..n-1 index) to get a larger input for timing.
df = pd.concat([df]*100, ignore_index=True)
# IPython `%timeit` magics (not plain Python); the trailing comments record
# the timings reported for each implementation.
%timeit original(df) # 675 ms per loop
%timeit jp(df) # 526 ms per loop