python impute missing values code example

Example 1: replace missing values, encoded as np.nan, using the mean value of the columns

# Univariate feature imputation

import numpy as np
from sklearn.impute import SimpleImputer
# Learn the per-column means from a small training set; np.nan marks the
# entries that are considered missing.
train_rows = [[1, 2], [np.nan, 3], [7, 6]]
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(train_rows)
# SimpleImputer()

# Fill the gaps of new data with the column means learned during fit.
X = [[np.nan, 2], [6, np.nan], [7, 6]]
filled = imp.transform(X)
print(filled)
# [[4.          2.        ]
#  [6.          3.666...]
#  [7.          6.        ]]

# SimpleImputer class also supports categorical data

import pandas as pd
# Two categorical columns, each containing one missing entry (np.nan).
rows = [
    ["a", "x"],
    [np.nan, "y"],
    ["a", np.nan],
    ["b", "y"],
]
df = pd.DataFrame(rows, dtype="category")

# "most_frequent" fills each gap with the most common value of its column.
imp = SimpleImputer(strategy="most_frequent")
imputed = imp.fit_transform(df)
print(imputed)
# [['a' 'x']
#  ['a' 'y']
#  ['a' 'y']
#  ['b' 'y']]

Example 2: how to check missing values in python

# Total missing values for each feature
print(df.isnull().sum())
# Out:
# ST_NUM          2
# ST_NAME         0
# OWN_OCCUPIED    2
# NUM_BEDROOMS    4

Example 3: Marking imputed values

# Marking imputed values

from sklearn.impute import MissingIndicator
# Sample data in which -1 (not NaN) is the marker for a missing value.
# (A stray ")" after the first row previously made this a SyntaxError.)
X = np.array([[-1, -1, 1, 3],
              [4, -1, 0, -1],
              [8, -1, 1, 0]])

# Without a `features` argument the mask only has columns for the features
# that contained missing values during fit; `features_` lists which ones.
indicator = MissingIndicator(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only
# array([[ True,  True, False],
#        [False,  True,  True],
#        [False,  True, False]])

indicator.features_
# array([0, 1, 3])

# features="all" returns one mask column per input feature, including
# feature 2, which has no missing values.
indicator = MissingIndicator(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
mask_all
# array([[ True,  True, False, False],
#        [False,  True, False,  True],
#        [False,  True, False, False]])
indicator.features_
# array([0, 1, 2, 3])

from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier
X, y = load_iris(return_X_y=True)

# Randomly blank out entries of the iris features so there is something to
# impute. The builtin `bool` is used because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.random.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan

# Hold out 100 samples for testing; the test labels are not needed here.
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                               random_state=0)

# Concatenate the mean-imputed features with the missing-value indicator
# mask, so a downstream model can see which values were imputed.
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator())])
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape
# (100, 8)

# Use the combined transformer as the preprocessing step of a classifier.
clf = make_pipeline(transformer, DecisionTreeClassifier())
clf = clf.fit(X_train, y_train)
results = clf.predict(X_test)
results.shape
# (100,)