Example 1: Multivariate feature imputation
# Multivariate feature imputation
#
# Fit an IterativeImputer on a small data set with missing entries, then use
# it to fill the gaps in rows it has not seen before.
import numpy as np

# Importing this module is what makes IterativeImputer importable below; the
# imported name itself is never referenced.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=0, max_iter=10)
imp.fit(
    [
        [1, 2],
        [3, 6],
        [4, 8],
        [np.nan, 3],
        [7, np.nan],
    ]
)
# IterativeImputer(random_state=0)

# Every row to impute has exactly one missing entry.
X_test = [
    [np.nan, 2],
    [6, np.nan],
    [np.nan, 6],
]

# The fitted model has picked up that the second column is twice the first.
print(imp.transform(X_test).round())
# [[ 1.  2.]
#  [ 6. 12.]
#  [ 3.  6.]]
Example 2: Marking imputed values
# Marking imputed values
#
# MissingIndicator turns a data set into a boolean mask that flags where
# values are missing. In this example -1 is the placeholder for "missing".
from sklearn.impute import MissingIndicator

# Three samples with four features each. (The array literal must stay one
# balanced expression: a stray ")" after the first row breaks parsing.)
X = np.array(
    [
        [-1, -1, 1, 3],
        [4, -1, 0, -1],
        [8, -1, 1, 0],
    ]
)

# Without a `features` argument the mask only gets columns for the features
# that contained missing values during fit (here columns 0, 1 and 3).
indicator = MissingIndicator(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only
# array([[ True,  True, False],
#        [False,  True,  True],
#        [False,  True, False]])
indicator.features_
# array([0, 1, 3])

# With features="all" the mask has one column per input feature, whether or
# not that feature had any missing value.
indicator = MissingIndicator(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
mask_all
# array([[ True,  True, False, False],
#        [False,  True, False,  True],
#        [False,  True, False, False]])
indicator.features_
# array([0, 1, 2, 3])
# Combine imputed features with missing-value indicators in one pipeline, so
# the classifier sees both the filled-in values and where the gaps were.
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

# Blank out a random selection of entries (each cell drawn as 0 or 1).
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# NOTE: the mask is drawn from NumPy's global RNG without a seed, so the
# exact cells that go missing differ from run to run.
mask = np.random.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan

# Hold out 100 samples for testing; the test labels are not needed here.
X_train, X_test, y_train, _ = train_test_split(
    X, y, test_size=100, random_state=0
)

# Stack the mean-imputed features side by side with the missing indicators.
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator()),
    ]
)
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape
# (100, 8)

# Feed the combined representation into a classifier.
clf = make_pipeline(transformer, DecisionTreeClassifier())
clf = clf.fit(X_train, y_train)
results = clf.predict(X_test)
results.shape
# (100,)