python中的隔离林算法_Python_R_Algorithm_Random Forest

python中的隔离林算法

python r algorithm

python中的隔离林算法,python,r,algorithm,random-forest,Python,R,Algorithm,Random Forest,我试图用python重现隔离林论文中描述的算法这是我当前的代码： import numpy as np import sklearn as sk import matplotlib.pyplot as plt import pandas as pd from sklearn.decomposition import PCA def _h(i): return np.log(i) + 0.5772156649 def _c(n): if n > 2:

我试图用python重现隔离林论文中描述的算法

这是我当前的代码：

import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA


def _h(i):
    return np.log(i) + 0.5772156649 


def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - 2*(n - 1)/n
    if n == 2:
        return 1
    else:
        return 0


def _anomaly_score(dict_scores, n_samples):
    score = np.array([np.mean(dict_scores[k]) for k in dict_scores.keys()])
    score = -score/_c(n_samples)

    return 2**score


def _split_data(X):
    ''' split the data in the left and right nodes ''' 
    n_samples, n_columns = X.shape
    n_features = n_columns - 1

    feature_id = np.random.randint(low=0, high=n_features-1)
    feature = X[:, feature_id]
    split_value = np.random.choice(feature)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value


def iTree(X, add_index=False, max_depth = np.inf):            
    ''' construct an isolation tree and returns the number of step required
    to isolate an element. A column of index is added to the input matrix X if  
    add_index=True. This column is required in the algorithm. ''' 

    n_split = {} 
    def iterate(X, count = 0):

        n_samples, n_columns = X.shape
        n_features = n_columns - 1

        if count > max_depth:
            for index in X[:,-1]:
                n_split[index] = count
            return

        if n_samples == 1:
            index = X[0, n_columns-1]
            n_split[index] = count
            return 
        else:
            lX, rX, feature_id, split_value = _split_data(X)
            # Uncomment the print to visualize a draft of 
            # the construction of the tree
            #print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
            n_samples_lX, _ = lX.shape
            n_samples_rX, _ = rX.shape
            if n_samples_lX > 0:
                iterate(lX, count+1)
            if n_samples_rX >0:
                iterate(rX, count+1)

    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]

    iterate(X)
    return n_split


class iForest():
    ''' Class to construct the isolation forest.

    -n_estimators: is the number of trees in the forest,

    -sample_size: is the bootstrap parameter used during the construction
    of the forest,

    -add_index: adds a column of index to the matrix X. This is required and 
    add_index can be set to False only if the last column of X contains 
    already indeces.

    -max_depth: is the maximum depth of each tree
    '''
    def __init__(self, n_estimators=20, sample_size=None, add_index = True, 
                 max_depth = 100):
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.add_index = add_index
        self.max_depth = max_depth
        return 

    def fit(self, X):
        n_samples, n_features = X.shape
        if self.sample_size == None:
            self.sample_size = int(n_samples/2)

        if self.add_index:
            X = np.c_[X, range(n_samples)]


        trees = [iTree(X[np.random.choice(n_samples, 
                                          self.sample_size, 
                                          replace=False)],
                       max_depth=self.max_depth) 
                 for i in range(self.n_estimators)]

        self.all_anomaly_score_ = {k:None for k in range(n_samples)}
        for k in self.all_anomaly_score_.keys():
            self.all_anomaly_score_[k] = np.array([tree[k] 
                                                   for tree in trees 
                                                   if k in tree])

        self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
        return self

sol

通常与使用R软件获得的正确解决方案不同。

通过以下方法获得了R中的正确溶液：

> tr = IsolationTrees(stackloss,ntree = 100000,hlim = 100, rFactor = 1)
> as = AnomalyScore(stackloss, tr)
> order(as$outF)
 [1] 11  6  5  9 13 10 12 18  7 20  8 15 14 16 19  4 21 17  3  2  1
> order(as$outF)-1
 [1] 10  5  4  8 12  9 11 17  6 19  7 14 13 15 18  3 20 16  2  1  0
>

错误在哪里？

scikit学习中有一个拉取请求：

您正在使用n个样本计算_异常_分数，n个样本是样本总数。但是，您正在使用子样本构建树。因此，在计算平均搜索长度“_c（n）”时，应该使用样本大小而不是n个样本，因为树是用子样本构建的。因此，我认为您的代码应该是：

self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)

我终于解决了这个问题。由于在数据的每个分割中执行了连续复制操作，因此代码仍然很慢

这是算法的工作版本

import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd


def _h(i):
    return np.log(i) + 0.5772156649 


def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - 2*(n - 1)/n
    if n == 2:
        return 1
    else:
        return 0


def _anomaly_score(score, n_samples):

    score = -score/_c(n_samples)

    return 2**score


def _split_data(X):
    ''' split the data in the left and right nodes ''' 
    n_samples, n_columns = X.shape
    n_features = n_columns - 1
    m = M = 0
    while m == M:
        feature_id = np.random.randint(low=0, high=n_features)
        feature = X[:, feature_id]
        m = feature.min()
        M = feature.max()
        #print(m, M, feature_id, X.shape)

    split_value = np.random.uniform(m, M, 1)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value


def iTree(X, add_index=False, max_depth = np.inf):            
    ''' construct an isolation tree and returns the number of step required
    to isolate an element. A column of index is added to the input matrix X if  
    add_index=True. This column is required in the algorithm. ''' 

    n_split = {} 
    def iterate(X, count = 0):

        n_samples, n_columns = X.shape
        n_features = n_columns - 1

        if count > max_depth:
            for index in X[:,-1]:
                n_split[index] = count
            return

        if n_samples == 1:
            index = X[0, n_columns-1]
            n_split[index] = count
            return 
        else:
            lX, rX, feature_id, split_value = _split_data(X)
            # Uncomment the print to visualize a draft of 
            # the construction of the tree
            #print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
            n_samples_lX, _ = lX.shape
            n_samples_rX, _ = rX.shape
            if n_samples_lX > 0:
                iterate(lX, count+1)
            if n_samples_rX >0:
                iterate(rX, count+1)

    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]

    iterate(X)
    return n_split


class iForest():
    ''' Class to construct the isolation forest.

    -n_estimators: is the number of trees in the forest,

    -sample_size: is the bootstrap parameter used during the construction
    of the forest,

    -add_index: adds a column of index to the matrix X. This is required and 
    add_index can be set to False only if the last column of X contains 
    already indeces.

    -max_depth: is the maximum depth of each tree
    '''
    def __init__(self, n_estimators=20, sample_size=None, add_index = True, 
                 max_depth = 100):
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.add_index = add_index
        self.max_depth = max_depth
        return 

    def fit(self, X):
        n_samples, n_features = X.shape
        if self.sample_size == None:
            self.sample_size = int(n_samples/2)

        if self.add_index:
            X = np.c_[X, range(n_samples)]


        trees = [iTree(X[np.random.choice(n_samples, 
                                          self.sample_size, 
                                          replace=False)],
                       max_depth=self.max_depth) 
                 for i in range(self.n_estimators)]

        self.path_length_ = {k:None for k in range(n_samples)}
        for k in self.path_length_.keys():
            self.path_length_[k] = np.array([tree[k] 
                                             for tree in trees 
                                             if k in tree])
        self.path_length_ = np.array([self.path_length_[k].mean() for k in 
                                      self.path_length_.keys()])
        self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
        return self

将numpy导入为np
导入sklearn作为sk
将matplotlib.pyplot作为plt导入
作为pd进口熊猫
定义h（i）：
返回np.log（i）+0.5772156649
def_c（n）：
如果n>2：
h=_h（n-1）
返回2*h-2*（n-1）/n
如果n==2：
返回1
其他：
返回0
def异常评分（评分，n个样本）：
分数=-分数/_c（n_样本）
返回2**分
定义分割数据（X）：
''在左侧和右侧节点中拆分数据''
n_个样本，n_个列=X.shape
n_特征=n_列-1
m=m=0
当m==m时：
特征id=np.random.randint（低=0，高=n个特征）
feature=X[：，feature_id]
m=feature.min（）
M=feature.max（）
#打印（m、m、特征标识、X形状）
分割值=np.随机.均匀（m，m，1）
左_X=X[特征分割_值]
返回左、右、特征id、分割值
def iTree（X，add_index=False，max_depth=np.inf）：
''构造一个隔离树并返回所需的步骤数
隔离一个元素。如果出现以下情况，则向输入矩阵X添加一列索引
add_index=True。算法中需要此列。“”
n_split={}
def迭代（X，计数=0）：
n_个样本，n_个列=X.shape
n_特征=n_列-1
如果计数>最大深度：
对于X[：，-1]中的索引：
n_分割[索引]=计数
返回
如果n_samples==1：
索引=X[0，n_列-1]
n_分割[索引]=计数
返回
其他：
lX，rX，特征标识，分割值=\u分割数据（X）
#取消对打印的注释以可视化打印的草稿
#树的构造
#打印（lX[：，-1]，rX[：，-1]，特征id，分割值，n分割）
n_样本_lX，x=lX.shape
n\u样本\u rX，\ux=rX.shape
如果n_样本数_lX>0：
迭代（lX，计数+1）
如果n_样本数_rX>0：
迭代（接收，计数+1）
如果添加索引：
n_样本，u=X.shape
X=np.c_X，范围（n个样本）]
迭代（X）
返回n_分割
类iForest（）：
''类来构造隔离林。
-n_估计器：是森林中的树木数量，
-示例大小：是构建过程中使用的引导参数
在森林里，
-添加索引：向矩阵X添加一列索引。这是必需的，并且
只有在X的最后一列包含
已经有索引了。
-最大深度：是每棵树的最大深度
'''
def uuu init uuuu（self，n_估计值=20，样本大小=None，add_index=True，
最大深度=100）：
self.n_估计量=n_估计量
self.sample\u size=样本大小
self.add\u index=add\u index
self.max\u depth=最大深度
返回
def配合（自身，X）：
n_样本，n_特征=X形状
如果self.sample_size==无：
self.sample\u size=int（n\u样本/2）
如果self.add\u索引：
X=np.c_X，范围（n个样本）]
trees=[iTree（X[np.random.choice（n_样本，
自我样本大小，
replace=False）]，
最大深度=自身最大深度）
对于范围内的i（自n_估计量）]
self.path_length={k:n范围内k的无（n_样本）}
对于self.path\u length\u.keys（）中的k：
self.path\u length\uk]=np.array（[tree[k]
树中树
如果树中有k]）
self.path\u length\uu=np.array（[self.path\u length\uk].mean（）表示k in
self.path_length_u.keys（））
self.normality\u score\uu=\u normality\u score（self.path\u length\uu，self.sample\u size）
回归自我

Donbeo，您的代码只需进行一些小的调整就可以很好地工作，它的主要问题是您错过了递归算法的一个基本情况（结束条件），因此当出现该条件时，它会挂起在循环中。您需要在_split_data函数（如下代码所示）中实现这一效果，并在iterate函数（未显示）中处理这种情况

谢谢这将非常有用。现在我只想知道我做错了什么：-）非常好的地方。这是一个错误，但我不认为这是问题的原因。21是数据的总大小，因此在本例中

n\u samples=sample\u size

您是说您的解决方案总是比R的解决方案小1？或者你的解决方案完全不同？当我多次运行代码时，输出是不一样的（由于随机拆分，输出中的相似性随着更多树的增加而增加）。我认为我的解决方案总是不同的。今天下午，我将发布代码，以便与R脚本进行更简单的比较。我在R中报告了结果-1，以考虑到R和Python中不同的指标体系问题是什么？

self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)

import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd


def _h(i):
    return np.log(i) + 0.5772156649 


def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - 2*(n - 1)/n
    if n == 2:
        return 1
    else:
        return 0


def _anomaly_score(score, n_samples):

    score = -score/_c(n_samples)

    return 2**score


def _split_data(X):
    ''' split the data in the left and right nodes ''' 
    n_samples, n_columns = X.shape
    n_features = n_columns - 1
    m = M = 0
    while m == M:
        feature_id = np.random.randint(low=0, high=n_features)
        feature = X[:, feature_id]
        m = feature.min()
        M = feature.max()
        #print(m, M, feature_id, X.shape)

    split_value = np.random.uniform(m, M, 1)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value


def iTree(X, add_index=False, max_depth = np.inf):            
    ''' construct an isolation tree and returns the number of step required
    to isolate an element. A column of index is added to the input matrix X if  
    add_index=True. This column is required in the algorithm. ''' 

    n_split = {} 
    def iterate(X, count = 0):

        n_samples, n_columns = X.shape
        n_features = n_columns - 1

        if count > max_depth:
            for index in X[:,-1]:
                n_split[index] = count
            return

        if n_samples == 1:
            index = X[0, n_columns-1]
            n_split[index] = count
            return 
        else:
            lX, rX, feature_id, split_value = _split_data(X)
            # Uncomment the print to visualize a draft of 
            # the construction of the tree
            #print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
            n_samples_lX, _ = lX.shape
            n_samples_rX, _ = rX.shape
            if n_samples_lX > 0:
                iterate(lX, count+1)
            if n_samples_rX >0:
                iterate(rX, count+1)

    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]

    iterate(X)
    return n_split


class iForest():
    ''' Class to construct the isolation forest.

    -n_estimators: is the number of trees in the forest,

    -sample_size: is the bootstrap parameter used during the construction
    of the forest,

    -add_index: adds a column of index to the matrix X. This is required and 
    add_index can be set to False only if the last column of X contains 
    already indeces.

    -max_depth: is the maximum depth of each tree
    '''
    def __init__(self, n_estimators=20, sample_size=None, add_index = True, 
                 max_depth = 100):
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.add_index = add_index
        self.max_depth = max_depth
        return 

    def fit(self, X):
        n_samples, n_features = X.shape
        if self.sample_size == None:
            self.sample_size = int(n_samples/2)

        if self.add_index:
            X = np.c_[X, range(n_samples)]


        trees = [iTree(X[np.random.choice(n_samples, 
                                          self.sample_size, 
                                          replace=False)],
                       max_depth=self.max_depth) 
                 for i in range(self.n_estimators)]

        self.path_length_ = {k:None for k in range(n_samples)}
        for k in self.path_length_.keys():
            self.path_length_[k] = np.array([tree[k] 
                                             for tree in trees 
                                             if k in tree])
        self.path_length_ = np.array([self.path_length_[k].mean() for k in 
                                      self.path_length_.keys()])
        self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
        return self

minv = maxv = 0
inspected = Set()   # this set tracks the candidates that we already inspected
while minv == maxv:

    # check whether we run out of features to try an none of them has different values,
    # if that is the case we need to break the loop otherwise this loops forever
    if len(inspected) == n_features:
        # if we run out of features to try an none of them has different values,
        # return -1 to signal the caller that we can't split X anymore.
        return X, X, -1, None

    feature_id = np.random.randint(low=0, high=n_features)
    if feature_id not in inspected:
        inspected.add(feature_id)
        split_feature = X[:, feature_id]
        minv = split_feature.min()
        maxv = split_feature.max()