python中的隔离林算法
我试图用python重现隔离林论文中描述的算法 这是我当前的代码:python中的隔离林算法,python,r,algorithm,random-forest,Python,R,Algorithm,Random Forest,我试图用python重现隔离林论文中描述的算法 这是我当前的代码: import numpy as np import sklearn as sk import matplotlib.pyplot as plt import pandas as pd from sklearn.decomposition import PCA def _h(i): return np.log(i) + 0.5772156649 def _c(n): if n > 2:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
def _h(i):
return np.log(i) + 0.5772156649
def _c(n):
if n > 2:
h = _h(n-1)
return 2*h - 2*(n - 1)/n
if n == 2:
return 1
else:
return 0
def _anomaly_score(dict_scores, n_samples):
score = np.array([np.mean(dict_scores[k]) for k in dict_scores.keys()])
score = -score/_c(n_samples)
return 2**score
def _split_data(X):
''' split the data in the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
feature_id = np.random.randint(low=0, high=n_features-1)
feature = X[:, feature_id]
split_value = np.random.choice(feature)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth = np.inf):
''' construct an isolation tree and returns the number of step required
to isolate an element. A column of index is added to the input matrix X if
add_index=True. This column is required in the algorithm. '''
n_split = {}
def iterate(X, count = 0):
n_samples, n_columns = X.shape
n_features = n_columns - 1
if count > max_depth:
for index in X[:,-1]:
n_split[index] = count
return
if n_samples == 1:
index = X[0, n_columns-1]
n_split[index] = count
return
else:
lX, rX, feature_id, split_value = _split_data(X)
# Uncomment the print to visualize a draft of
# the construction of the tree
#print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
n_samples_lX, _ = lX.shape
n_samples_rX, _ = rX.shape
if n_samples_lX > 0:
iterate(lX, count+1)
if n_samples_rX >0:
iterate(rX, count+1)
if add_index:
n_samples, _ = X.shape
X = np.c_[X, range(n_samples)]
iterate(X)
return n_split
class iForest():
''' Class to construct the isolation forest.
-n_estimators: is the number of trees in the forest,
-sample_size: is the bootstrap parameter used during the construction
of the forest,
-add_index: adds a column of index to the matrix X. This is required and
add_index can be set to False only if the last column of X contains
already indeces.
-max_depth: is the maximum depth of each tree
'''
def __init__(self, n_estimators=20, sample_size=None, add_index = True,
max_depth = 100):
self.n_estimators = n_estimators
self.sample_size = sample_size
self.add_index = add_index
self.max_depth = max_depth
return
def fit(self, X):
n_samples, n_features = X.shape
if self.sample_size == None:
self.sample_size = int(n_samples/2)
if self.add_index:
X = np.c_[X, range(n_samples)]
trees = [iTree(X[np.random.choice(n_samples,
self.sample_size,
replace=False)],
max_depth=self.max_depth)
for i in range(self.n_estimators)]
self.all_anomaly_score_ = {k:None for k in range(n_samples)}
for k in self.all_anomaly_score_.keys():
self.all_anomaly_score_[k] = np.array([tree[k]
for tree in trees
if k in tree])
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
return self
sol
通常与使用R软件获得的正确解决方案不同。
通过以下方法获得了R中的正确溶液:
> tr = IsolationTrees(stackloss,ntree = 100000,hlim = 100, rFactor = 1)
> as = AnomalyScore(stackloss, tr)
> order(as$outF)
[1] 11 6 5 9 13 10 12 18 7 20 8 15 14 16 19 4 21 17 3 2 1
> order(as$outF)-1
[1] 10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0
>
错误在哪里?scikit学习中有一个拉取请求: 您正在使用n个样本计算_异常_分数,n个样本是样本总数。但是,您正在使用子样本构建树。因此,在计算平均搜索长度“_c(n)”时,应该使用样本大小而不是n个样本,因为树是用子样本构建的。因此,我认为您的代码应该是:
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)
我终于解决了这个问题。 由于在数据的每个分割中执行了连续复制操作,因此代码仍然很慢 这是算法的工作版本
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
def _h(i):
return np.log(i) + 0.5772156649
def _c(n):
if n > 2:
h = _h(n-1)
return 2*h - 2*(n - 1)/n
if n == 2:
return 1
else:
return 0
def _anomaly_score(score, n_samples):
score = -score/_c(n_samples)
return 2**score
def _split_data(X):
''' split the data in the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
m = M = 0
while m == M:
feature_id = np.random.randint(low=0, high=n_features)
feature = X[:, feature_id]
m = feature.min()
M = feature.max()
#print(m, M, feature_id, X.shape)
split_value = np.random.uniform(m, M, 1)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth = np.inf):
''' construct an isolation tree and returns the number of step required
to isolate an element. A column of index is added to the input matrix X if
add_index=True. This column is required in the algorithm. '''
n_split = {}
def iterate(X, count = 0):
n_samples, n_columns = X.shape
n_features = n_columns - 1
if count > max_depth:
for index in X[:,-1]:
n_split[index] = count
return
if n_samples == 1:
index = X[0, n_columns-1]
n_split[index] = count
return
else:
lX, rX, feature_id, split_value = _split_data(X)
# Uncomment the print to visualize a draft of
# the construction of the tree
#print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
n_samples_lX, _ = lX.shape
n_samples_rX, _ = rX.shape
if n_samples_lX > 0:
iterate(lX, count+1)
if n_samples_rX >0:
iterate(rX, count+1)
if add_index:
n_samples, _ = X.shape
X = np.c_[X, range(n_samples)]
iterate(X)
return n_split
class iForest():
''' Class to construct the isolation forest.
-n_estimators: is the number of trees in the forest,
-sample_size: is the bootstrap parameter used during the construction
of the forest,
-add_index: adds a column of index to the matrix X. This is required and
add_index can be set to False only if the last column of X contains
already indeces.
-max_depth: is the maximum depth of each tree
'''
def __init__(self, n_estimators=20, sample_size=None, add_index = True,
max_depth = 100):
self.n_estimators = n_estimators
self.sample_size = sample_size
self.add_index = add_index
self.max_depth = max_depth
return
def fit(self, X):
n_samples, n_features = X.shape
if self.sample_size == None:
self.sample_size = int(n_samples/2)
if self.add_index:
X = np.c_[X, range(n_samples)]
trees = [iTree(X[np.random.choice(n_samples,
self.sample_size,
replace=False)],
max_depth=self.max_depth)
for i in range(self.n_estimators)]
self.path_length_ = {k:None for k in range(n_samples)}
for k in self.path_length_.keys():
self.path_length_[k] = np.array([tree[k]
for tree in trees
if k in tree])
self.path_length_ = np.array([self.path_length_[k].mean() for k in
self.path_length_.keys()])
self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
return self
将numpy导入为np
导入sklearn作为sk
将matplotlib.pyplot作为plt导入
作为pd进口熊猫
定义h(i):
返回np.log(i)+0.5772156649
def_c(n):
如果n>2:
h=_h(n-1)
返回2*h-2*(n-1)/n
如果n==2:
返回1
其他:
返回0
def异常评分(评分,n个样本):
分数=-分数/_c(n_样本)
返回2**分
定义分割数据(X):
''在左侧和右侧节点中拆分数据''
n_个样本,n_个列=X.shape
n_特征=n_列-1
m=m=0
当m==m时:
特征id=np.random.randint(低=0,高=n个特征)
feature=X[:,feature_id]
m=feature.min()
M=feature.max()
#打印(m、m、特征标识、X形状)
分割值=np.随机.均匀(m,m,1)
左_X=X[特征分割_值]
返回左、右、特征id、分割值
def iTree(X,add_index=False,max_depth=np.inf):
''构造一个隔离树并返回所需的步骤数
隔离一个元素。如果出现以下情况,则向输入矩阵X添加一列索引
add_index=True。算法中需要此列。“”
n_split={}
def迭代(X,计数=0):
n_个样本,n_个列=X.shape
n_特征=n_列-1
如果计数>最大深度:
对于X[:,-1]中的索引:
n_分割[索引]=计数
返回
如果n_samples==1:
索引=X[0,n_列-1]
n_分割[索引]=计数
返回
其他:
lX,rX,特征标识,分割值=\u分割数据(X)
#取消对打印的注释以可视化打印的草稿
#树的构造
#打印(lX[:,-1],rX[:,-1],特征id,分割值,n分割)
n_样本_lX,x=lX.shape
n\u样本\u rX,\ux=rX.shape
如果n_样本数_lX>0:
迭代(lX,计数+1)
如果n_样本数_rX>0:
迭代(接收,计数+1)
如果添加索引:
n_样本,u=X.shape
X=np.c_X,范围(n个样本)]
迭代(X)
返回n_分割
类iForest():
''类来构造隔离林。
-n_估计器:是森林中的树木数量,
-示例大小:是构建过程中使用的引导参数
在森林里,
-添加索引:向矩阵X添加一列索引。这是必需的,并且
只有在X的最后一列包含
已经有索引了。
-最大深度:是每棵树的最大深度
'''
def uuu init uuuu(self,n_估计值=20,样本大小=None,add_index=True,
最大深度=100):
self.n_估计量=n_估计量
self.sample\u size=样本大小
self.add\u index=add\u index
self.max\u depth=最大深度
返回
def配合(自身,X):
n_样本,n_特征=X形状
如果self.sample_size==无:
self.sample\u size=int(n\u样本/2)
如果self.add\u索引:
X=np.c_X,范围(n个样本)]
trees=[iTree(X[np.random.choice(n_样本,
自我样本大小,
replace=False)],
最大深度=自身最大深度)
对于范围内的i(自n_估计量)]
self.path_length={k:n范围内k的无(n_样本)}
对于self.path\u length\u.keys()中的k:
self.path\u length\uk]=np.array([tree[k]
树中树
如果树中有k])
self.path\u length\uu=np.array([self.path\u length\uk].mean()表示k in
self.path_length_u.keys())
self.normality\u score\uu=\u normality\u score(self.path\u length\uu,self.sample\u size)
回归自我
Donbeo,您的代码只需进行一些小的调整就可以很好地工作,它的主要问题是您错过了递归算法的一个基本情况(结束条件),因此当出现该条件时,它会挂起在循环中。您需要在_split_data函数(如下代码所示)中实现这一效果,并在iterate函数(未显示)中处理这种情况
谢谢这将非常有用。现在我只想知道我做错了什么:-)非常好的地方。这是一个错误,但我不认为这是问题的原因。21是数据的总大小,因此在本例中
n\u samples=sample\u size
您是说您的解决方案总是比R的解决方案小1?或者你的解决方案完全不同?当我多次运行代码时,输出是不一样的(由于随机拆分,输出中的相似性随着更多树的增加而增加)。我认为我的解决方案总是不同的。今天下午,我将发布代码,以便与R脚本进行更简单的比较。我在R中报告了结果-1,以考虑到R和Python中不同的指标体系问题是什么?
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
def _h(i):
return np.log(i) + 0.5772156649
def _c(n):
if n > 2:
h = _h(n-1)
return 2*h - 2*(n - 1)/n
if n == 2:
return 1
else:
return 0
def _anomaly_score(score, n_samples):
score = -score/_c(n_samples)
return 2**score
def _split_data(X):
''' split the data in the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
m = M = 0
while m == M:
feature_id = np.random.randint(low=0, high=n_features)
feature = X[:, feature_id]
m = feature.min()
M = feature.max()
#print(m, M, feature_id, X.shape)
split_value = np.random.uniform(m, M, 1)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth = np.inf):
''' construct an isolation tree and returns the number of step required
to isolate an element. A column of index is added to the input matrix X if
add_index=True. This column is required in the algorithm. '''
n_split = {}
def iterate(X, count = 0):
n_samples, n_columns = X.shape
n_features = n_columns - 1
if count > max_depth:
for index in X[:,-1]:
n_split[index] = count
return
if n_samples == 1:
index = X[0, n_columns-1]
n_split[index] = count
return
else:
lX, rX, feature_id, split_value = _split_data(X)
# Uncomment the print to visualize a draft of
# the construction of the tree
#print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
n_samples_lX, _ = lX.shape
n_samples_rX, _ = rX.shape
if n_samples_lX > 0:
iterate(lX, count+1)
if n_samples_rX >0:
iterate(rX, count+1)
if add_index:
n_samples, _ = X.shape
X = np.c_[X, range(n_samples)]
iterate(X)
return n_split
class iForest():
''' Class to construct the isolation forest.
-n_estimators: is the number of trees in the forest,
-sample_size: is the bootstrap parameter used during the construction
of the forest,
-add_index: adds a column of index to the matrix X. This is required and
add_index can be set to False only if the last column of X contains
already indeces.
-max_depth: is the maximum depth of each tree
'''
def __init__(self, n_estimators=20, sample_size=None, add_index = True,
max_depth = 100):
self.n_estimators = n_estimators
self.sample_size = sample_size
self.add_index = add_index
self.max_depth = max_depth
return
def fit(self, X):
n_samples, n_features = X.shape
if self.sample_size == None:
self.sample_size = int(n_samples/2)
if self.add_index:
X = np.c_[X, range(n_samples)]
trees = [iTree(X[np.random.choice(n_samples,
self.sample_size,
replace=False)],
max_depth=self.max_depth)
for i in range(self.n_estimators)]
self.path_length_ = {k:None for k in range(n_samples)}
for k in self.path_length_.keys():
self.path_length_[k] = np.array([tree[k]
for tree in trees
if k in tree])
self.path_length_ = np.array([self.path_length_[k].mean() for k in
self.path_length_.keys()])
self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
return self
minv = maxv = 0
inspected = Set() # this set tracks the candidates that we already inspected
while minv == maxv:
# check whether we run out of features to try an none of them has different values,
# if that is the case we need to break the loop otherwise this loops forever
if len(inspected) == n_features:
# if we run out of features to try an none of them has different values,
# return -1 to signal the caller that we can't split X anymore.
return X, X, -1, None
feature_id = np.random.randint(low=0, high=n_features)
if feature_id not in inspected:
inspected.add(feature_id)
split_feature = X[:, feature_id]
minv = split_feature.min()
maxv = split_feature.max()