Using GridSearchCV with IsolationForest for finding outliers
You need to create your own scoring function, since `IsolationForest` does not have a built-in `score` method. Instead, you can use the `score_samples` function that is available in `IsolationForest` (it can be considered a proxy for `score`), create your own scorer as described here, and pass it to `GridSearchCV`. I have modified your code to do this:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
# Toy data set: three feature columns followed by a trailing 'result' column.
df = pd.DataFrame(
    dict(
        first=[-112, 0, 1, 28, 5, 6, 3, 5, 4, 2, 7, 5, 1, 3, 2, 2, 5, 2, 42, 84, 13, 43, 13],
        second=[42, 1, 2, 85, 2, 4, 6, 8, 3, 5, 7, 3, 64, 1, 4, 1, 2, 4, 13, 1, 0, 40, 9],
        third=[3, 4, 7, 74, 3, 8, 2, 4, 7, 1, 53, 6, 5, 5, 59, 0, 5, 12, 65, 4, 3, 4, 11],
        result=[5, 2, 3, 0.04, 3, 4, 3, 125, 6, 6, 0.8, 9, 1, 4, 59, 12, 1, 4, 0, 8, 5, 4, 1],
    )
)
# Model input: every column except the last one ('result').
x = df.iloc[:, :-1]
# Hyper-parameter grid handed to GridSearchCV; every combination of the listed
# values is tried. Only n_estimators (2), n_jobs (3), random_state (2) and
# verbose (3) have more than one value, giving 2 * 3 * 2 * 3 = 36 candidates.
# NOTE(review): 'behaviour' and contamination='legacy' look like options from
# older scikit-learn releases -- confirm the installed version still accepts them.
# NOTE(review): n_jobs and verbose presumably do not change the fitted model, so
# listing several values mainly multiplies the search time -- verify.
tuned = {'n_estimators':[70,80], 'max_samples':['auto'],
'contamination':['legacy'], 'max_features':[1],
'bootstrap':[True], 'n_jobs':[None,1,2], 'behaviour':['old'],
'random_state':[None,1,], 'verbose':[0,1,2], 'warm_start':[True]}
def scorer_f(estimator, X, y=None):
    """Score a fitted estimator by the mean of its per-sample scores.

    Intended as the ``scoring`` callable for ``GridSearchCV`` when the
    estimator offers ``score_samples`` but no ``score`` method.

    Args:
        estimator: Fitted object exposing ``score_samples(X)``.
        X: Samples to evaluate; passed unchanged to ``score_samples``.
        y: Ignored. Accepted so the function also matches the
            ``scorer(estimator, X, y)`` calling convention used when
            targets are supplied to the search.

    Returns:
        The mean of ``estimator.score_samples(X)`` as computed by ``np.mean``.
    """
    return np.mean(estimator.score_samples(X))


# Alternatively, a lambda expression can serve as the scorer:
# scorer = lambda est, data: np.mean(est.score_samples(data))
# Exhaustive search over the `tuned` grid, ranking candidates with scorer_f.
isolation_forest = GridSearchCV(
    estimator=IsolationForest(),
    param_grid=tuned,
    scoring=scorer_f,
)
# Unsupervised fit: only the feature frame is passed, no target.
model = isolation_forest.fit(x)
SAMPLE OUTPUT
# Show the parameter combination the grid search selected as best.
print(model.best_params_)
{'behaviour': 'old',
'bootstrap': True,
'contamination': 'legacy',
'max_features': 1,
'max_samples': 'auto',
'n_estimators': 70,
'n_jobs': None,
'random_state': None,
'verbose': 1,
'warm_start': True}
Hope this helps!