python missing data code example

Example 1: replace missing values, encoded as np.nan, using the mean value of the columns

# Univariate feature imputation

import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
# SimpleImputer()
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
# [[4.          2.        ]
#  [6.          3.666...]
#  [7.          6.        ]]

# SimpleImputer class also supports categorical data

import pandas as pd
df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["a", np.nan],
                   ["b", "y"]], dtype="category")

imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df))
# [['a' 'x']
#  ['a' 'y']
#  ['a' 'y']
#  ['b' 'y']]

Example 2: whow i fill the data if most values are nan in jupyter notebook

# import pandas
import pandas as pd

# make a sample data
list_of_rows = [
  {'start_station': 1, 'end_station': 1},
  {'start_station': None, 'end_station': 1},
  {'start_station': 1, 'end_station': 2},
  {'start_station': 1, 'end_station': 3},
  {'start_station': 2, 'end_station': None},
  {'start_station': 2, 'end_station': 3},
  {'start_station': 2, 'end_station': 3},
]

# make a pandas data frame
df = pd.DataFrame(list_of_rows)

# define a function
def fill_NaNs_in_end_station(row):
    if pd.isnull(row['end_station']):
        start_station = row['start_station']
        return df[df['start_station']==start_station].end_station.value_counts().first_valid_index()
    return row['end_station']

# apply function to dataframe
df['end_station'] = df.apply(lambda row: fill_NaNs_in_end_station(row), axis=1)