Search code examples
pythonmachine-learningscikit-learntraining-dataanomaly-detection

Python: How to fit a model with user defined functions


I'm working on the isolation forest algorithm. I implemented the following code in order to build an isolation forest that contains iTrees.

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

class ExNode:
    """External (leaf) node of an isolation tree.

    It stores only ``size``, the value passed by the tree builder when
    recursion stops at this node.
    """

    def __init__(self, size):
        # Count of rows that were left in the partition at this leaf.
        self.size = size


class InNode:
    """Internal (split) node of an isolation tree.

    Attributes:
        left: child node reached when the tested value is below ``splitVal``.
        right: child node reached otherwise.
        splitAtt: name of the attribute (column) tested at this node.
        splitVal: threshold the attribute is compared against.
    """

    def __init__(self, left, right, splitAtt, splitVal):
        # The two subtrees hanging off this split.
        self.left, self.right = left, right
        # The split test itself: which attribute, and against which value.
        self.splitAtt, self.splitVal = splitAtt, splitVal

def iForest(X, noOfTrees, sampleSize):
    """Build an isolation forest as a plain list of isolation trees.

    Args:
        X: data set to sub-sample; must provide ``X.sample(n)`` (for example
            a pandas DataFrame).
        noOfTrees: number of trees to grow.
        sampleSize: number of rows drawn (via ``X.sample``) for each tree.

    Returns:
        A list with ``noOfTrees`` tree roots as returned by ``iTree``.

    Note:
        ``X.sample(sampleSize)`` is called with its defaults, so pandas will
        raise if ``sampleSize`` exceeds the number of rows in ``X``.
    """
    # Height limit: ceil(log2(sampleSize)); max(..., 2) keeps the limit at
    # 1 or more even for sampleSize <= 1.
    hlim = int(np.ceil(np.log2(max(sampleSize, 2))))
    # Each tree is grown from depth 0 on its own fresh random sub-sample.
    return [iTree(X.sample(sampleSize), 0, hlim) for _ in range(noOfTrees)]


def iTree(X, currHeight, hlim):
    """Recursively grow one isolation tree.

    Args:
        X: rows belonging to the current node (indexed by column name and
            filtered with boolean masks, as a pandas DataFrame allows).
        currHeight: depth of the current node; the root is at 0.
        hlim: height limit; recursion stops once ``currHeight`` reaches it.

    Returns:
        An ``ExNode`` holding ``len(X)`` when the height limit is reached or
        at most one row is left; otherwise an ``InNode`` with the two
        subtrees, the split attribute and the split value.
    """
    if currHeight >= hlim or len(X) <= 1:
        return ExNode(len(X))

    # Pick a random attribute, then a random split value among the distinct
    # values observed for that attribute in this partition.
    q = random.choice(X.columns)
    p = random.choice(X[q].unique())
    # NOTE(review): when p happens to be the smallest value of the column,
    # the left partition is empty and all rows go right.
    X_l = X[X[q] < p]
    X_r = X[X[q] >= p]

    # The left subtree is built before the right one.
    left = iTree(X_l, currHeight + 1, hlim)
    right = iTree(X_r, currHeight + 1, hlim)
    return InNode(left, right, q, p)

def pathLength(x, Tree, currHeight):
    """Return the path length of observation ``x`` in an isolation tree.

    Args:
        x: one observation, indexable by attribute name (``x[a]``).
        Tree: the current node, an ``InNode`` or an ``ExNode``.
        currHeight: number of edges traversed so far (0 at the root).

    Returns:
        The number of edges from the root to the leaf reached by ``x``, plus
        ``_c(leaf.size)``: the leaf-size adjustment used by the isolation
        forest PathLength algorithm, accounting for the subtree that was not
        grown below a leaf still holding several rows.
    """
    if isinstance(Tree, ExNode):
        # _c() is 0 for leaves with at most one row, so fully isolated
        # points keep their plain depth.
        return currHeight + _c(Tree.size)

    a = Tree.splitAtt
    # Same test as used when the tree was built: strictly below goes left.
    if x[a] < Tree.splitVal:
        return pathLength(x, Tree.left, currHeight + 1)
    return pathLength(x, Tree.right, currHeight + 1)


def _h(i):
    return np.log2(i) + 0.5772156649 

def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - (2*(n - 1)/n)
    if n == 2:
        return 1
    else:
        return 0


def _anomaly_score(score, n_samples):
    """Convert an average path length into an anomaly score.

    Computes ``2 ** (-score / _c(n_samples))``: the shorter the path
    relative to the normaliser, the closer the result is to 1.
    """
    normaliser = _c(n_samples)
    exponent = -score / normaliser
    return 2 ** exponent

# Load the data set and split the 'Target' column off from the feature columns.
df = pd.read_csv("db.csv")
y_true = df['Target']
# Use the keyword form: passing the axis positionally is rejected by pandas 2.x.
df_data = df.drop(columns='Target')
sampleSize = 256
X_train, X_test, y_train, y_test = train_test_split(df_data, y_true, test_size=0.3)
ifor = iForest(X_train, 100, sampleSize)

# Score every held-out row. X_test is the name produced by train_test_split
# above; no variable called `test` is defined.
for index, row in X_test.iterrows():
    testLenLst = [pathLength(row, tree, 0) for tree in ifor]
    if testLenLst:
        # Average path length of this row over all trees of the forest.
        ehx = sum(testLenLst) / float(len(testLenLst))
        # Compute the score once and reuse it for the test and the message.
        score = _anomaly_score(ehx, sampleSize)
        if score >= .5:
            print("Anomaly S(x,n) " + str(score))
        else:
            print("Normal S(x,n) " + str(score))

In fact, the real problem is that I want to display an iTree. To do that, I tried to use the .fit() function to build the model. But .fit() only works on models built from the predefined algorithms in Python libraries such as scikit-learn, whereas in my case I implemented the isolation forest algorithm myself. Below is how I tried to build the model and display the iTree.

# NOTE(review): this is the attempt that raises the error described below; it
# is kept unchanged and only annotated.
from sklearn.tree import export_graphviz
# `ifor` is the plain Python list returned by iForest() above, so it has no
# .fit() method (and no .tree attribute on the next line): AttributeError.
ifor.fit(X_train)
estimator = ifor.tree[1]

# export_graphviz is documented for fitted scikit-learn decision trees; the
# hand-written InNode/ExNode objects are not such estimators.
# NOTE(review): `df` comes from pd.read_csv, so `df.feature_names` and
# `df.target_names` presumably do not exist unless the CSV has columns with
# those names - verify.
export_graphviz(estimator, 
                out_file='tree.dot', 
                feature_names = df.feature_names,
                class_names = df.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

from subprocess import call

# Render the DOT file to a PNG with the Graphviz `dot` command-line tool.
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display the rendered image (IPython/Jupyter).
from IPython.display import Image 
Image(filename = 'tree.png')

It shows me the following error: [image: the error I get when I try to display an iTree]


Solution

  • Your question is unclear, but the best practice is to follow "How to write a custom estimator in sklearn and use cross-validation on it?": write a custom estimator and implement its fit() method following scikit-learn's conventions; otherwise it can become very confusing.

    As Python uses duck typing, try to avoid this complication by subclassing sklearn.base.BaseEstimator.