Random forest implemented from scratch in Python (NumPy only)

Example: building a decision tree and a random forest classifier from scratch

import numpy as np
from collections import Counter

# --- Decision tree implementation ---
def entropy(d):
    """Return the Shannon entropy (base 2) of the integer labels in *d*."""
    counts = np.bincount(d)
    probs = counts[counts > 0] / len(d)  # drop empty bins so log2 is defined
    return -np.sum(probs * np.log2(probs))

class Node:
    """One tree node: either an internal split or a leaf prediction.

    NOTE: the attribute names (including their original spelling) are read
    directly by Tree._trav, so they are part of the interface and kept as-is.
    """

    def __init__(self, beast_split_feature=None, threashold=None,
                 left=None, right=None, *, value=None):
        self.beast_split_feature = beast_split_feature  # feature index used to split
        self.threashold = threashold                    # split threshold on that feature
        self.left = left                                # subtree for values <= threshold
        self.right = right                              # subtree for values > threshold
        self.value = value                              # class label (leaves only)

    def is_leaf(self):
        """A node is a leaf exactly when it carries a class label."""
        return self.value is not None



class Tree:
    """A single CART-style decision tree classifier using the entropy criterion."""

    def __init__(self, min_split_sample=2, max_depth=100, n_feat=None):
        # min_split_sample: minimum number of samples a node needs to be split
        # max_depth: hard cap on recursion depth
        # n_feat: number of features sampled per split (None = use all features)
        self.min_split_sample = min_split_sample
        self.max_depth = max_depth
        self.n_feat = n_feat
        self.root = None

    def fit(self, X, y):
        """Grow the tree on data X of shape (n_samples, n_features) and labels y."""
        _, n_features = X.shape
        self.n_feat = n_features if not self.n_feat else min(n_features, self.n_feat)
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        """Return an array with the predicted class label for every row of X."""
        return np.array([self._trav(x, self.root) for x in X])

    def _trav(self, x, node):
        # Walk from *node* down to a leaf following the learned splits.
        # Values equal to the threshold go LEFT, matching _split below.
        if node.is_leaf():
            return node.value
        if x[node.beast_split_feature] <= node.threashold:
            return self._trav(x, node.left)
        return self._trav(x, node.right)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria: depth cap reached, too few samples to split,
        # or the node is already pure.
        # BUGFIX: the original compared n_classes against min_split_sample;
        # the sample count is what that threshold is meant to guard.
        if (depth >= self.max_depth
                or n_samples < self.min_split_sample
                or n_classes == 1):
            return Node(value=self._most_commen_val(y))

        # Random feature subset — this is what de-correlates forest trees.
        feat_idx = np.random.choice(n_features, self.n_feat, replace=False)

        best_threash, best_feat = self._best_criteria(X, y, feat_idx)
        left_idx, right_idx = self._split(X[:, best_feat], best_threash)

        # Guard: if the best split is degenerate (e.g. a constant column),
        # stop here instead of recursing on the same data until max_depth.
        if len(left_idx) == 0 or len(right_idx) == 0:
            return Node(value=self._most_commen_val(y))

        left_child = self._grow_tree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self._grow_tree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_threash, left_child, right_child)

    def _best_criteria(self, X, y, feat_idxs):
        # Exhaustively score every (feature, threshold) pair among the sampled
        # features; return (best_threshold, best_feature_index).
        best_gain = -1
        split_idx, split_threshold = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            for threshold in np.unique(X_column):
                gain = self._info_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = threshold
        return split_threshold, split_idx

    def _info_gain(self, y, X_column, threshold):
        # Information gain = parent entropy - weighted average child entropy.
        parent_entropy = entropy(y)
        left_idx, right_idx = self._split(X_column, threshold)
        if len(left_idx) == 0 or len(right_idx) == 0:
            return 0  # split separates nothing: no gain
        n = len(y)
        child_entropy = ((len(left_idx) / n) * entropy(y[left_idx])
                         + (len(right_idx) / n) * entropy(y[right_idx]))
        return parent_entropy - child_entropy

    def _split(self, X_column, threshold):
        # BUGFIX: the original used `<` here but `<=` in _trav, so samples
        # exactly at the threshold were routed right during training but left
        # during prediction. Use `<=` in both places so they agree.
        left = np.argwhere(X_column <= threshold).flatten()
        right = np.argwhere(X_column > threshold).flatten()
        return left, right

    def _most_commen_val(self, y):
        # Majority class label of y (ties broken by first occurrence).
        return Counter(y).most_common(1)[0][0]

# --- Random forest implementation ---
def bootstrab(X, y):
    """Draw a bootstrap sample (rows with replacement) the same size as X."""
    n = X.shape[0]
    picks = np.random.choice(n, size=n, replace=True)
    return X[picks], y[picks]

def most_commen_val(y):
    """Return the most frequent label in y (majority vote)."""
    (label, _count), = Counter(y).most_common(1)
    return label

class RandomForst:
    """Bagging ensemble of decision trees with majority-vote prediction."""

    def __init__(self, num_tree=100, min_split_sample=2, max_depth=100, n_feat=None):
        self.num_tree = num_tree                    # ensemble size
        self.min_split_sample = min_split_sample    # forwarded to each Tree
        self.max_depth = max_depth                  # forwarded to each Tree
        self.n_feat = n_feat                        # features sampled per split

    def fit(self, X, y):
        """Fit num_tree trees, each on an independent bootstrap sample."""
        self.tree = []
        for _ in range(self.num_tree):
            X_boot, y_boot = bootstrab(X, y)
            member = Tree(self.min_split_sample, self.max_depth, self.n_feat)
            member.fit(X_boot, y_boot)
            self.tree.append(member)

    def predict(self, X):
        """Majority vote over the per-tree predictions for each sample."""
        per_tree = np.array([member.predict(X) for member in self.tree])
        per_sample = np.swapaxes(per_tree, 0, 1)  # (n_samples, num_tree)
        return [most_commen_val(votes) for votes in per_sample]

Tags:

Misc Example