Python: 即使在分层聚类之后,相关特征的置换特征重要性仍然很低 (python, scikit-learn)

我有一个处理相关数据的神经网络。我想按照 scikit-learn 的示例,使用排列特征重要性 (permutation importance) 输出特征重要性。即使我的神经网络性能很好,我得到的重要性数值也很低。我不知道该从哪里开始修复这个问题。我的代码:

def run():
    """Train an MLP regressor on the protein dataset, cluster correlated
    features hierarchically, and report permutation importances for one
    representative feature per cluster.

    Side effects: reads ``dataset_withgender.xlsx``, draws a dendrogram and
    correlation heatmap, and appends the significant importances to
    ``results_2.txt``.
    """
    torch.multiprocessing.freeze_support()

    dataset = pd.read_excel("dataset_withgender.xlsx")
    target = dataset['ProteinX']
    # Drop the first column and keep the feature columns.
    # NOTE(review): assumes column 0 is an id/target column -- verify.
    dataset = dataset.iloc[:, 1:9154]
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.values.astype(np.float32),
        target.values.reshape(-1).astype(np.float32),
        test_size=.3,
        random_state=42)

    class MultiLayerPredictor(torch.nn.Module):
        """Three-layer MLP whose output is squashed to (0, 1) by a sigmoid.

        NOTE(review): the ``module__num_units`` / ``module__drop`` entries in
        the search space below arrive here as kwargs and are silently
        discarded by ``**kwargs`` -- searching over them is a no-op. Implement
        them (variable hidden size / dropout) if they are meant to matter.
        """

        def __init__(self, input_shape=8778, output_shape=1, hidden_dim=1024, **kwargs):
            super().__init__()
            self.fc1 = torch.nn.Linear(in_features=input_shape, out_features=hidden_dim)
            self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
            self.fc3 = torch.nn.Linear(in_features=hidden_dim, out_features=output_shape)

        def forward(self, x):
            l1 = torch.relu(self.fc1(x))
            l2 = torch.relu(self.fc2(l1))
            # NOTE(review): sigmoid bounds predictions to (0, 1); confirm the
            # regression target is scaled into that range, otherwise MSE loss
            # cannot fit it and the model (and importances) will look flat.
            return torch.sigmoid(self.fc3(l2)).reshape(-1)

    net = NeuralNet(
        MultiLayerPredictor,
        criterion=nn.MSELoss,
        max_epochs=10,
        optimizer=optim.Adam,
        lr=0.1,
        iterator_train__shuffle=False
    )

    # Log-uniform learning-rate candidates for the randomized search.
    lr = (10 ** np.random.uniform(-5, -2.5, 1000)).tolist()
    params = {
        'optimizer__lr': lr,
        'max_epochs': [300, 400, 500],
        'module__num_units': [14, 20, 28, 36, 42],
        'module__drop': [0, .1, .2, .3, .4]
    }

    # Hierarchical (Ward) clustering of the Spearman correlation matrix,
    # following the scikit-learn "multicollinear features" example.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
    corr = spearmanr(dataset).correlation
    corr_linkage = hierarchy.ward(corr)
    dendro = hierarchy.dendrogram(
        corr_linkage, labels=dataset.columns.values.tolist(), ax=ax1,
        leaf_rotation=90)
    dendro_idx = np.arange(0, len(dendro['ivl']))

    ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
    ax2.set_xticks(dendro_idx)
    ax2.set_yticks(dendro_idx)
    ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
    ax2.set_yticklabels(dendro['ivl'])
    fig.tight_layout()

    # Cut the dendrogram at distance 1 and keep the first feature of each
    # cluster as its representative.
    cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)
    selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

    print("Set feature sets")
    X_train_sel = X_train[:, selected_features]
    X_test_sel = X_test[:, selected_features]

    # BUG FIX: the original assigned the undefined name `net2` (NameError);
    # the estimator built above is `net`.
    net_selected = net
    gs_sel = RandomizedSearchCV(net_selected, params, refit=True, cv=3,
                                scoring='neg_mean_squared_error', n_iter=2)
    gs_sel.fit(X_train_sel, y_train)

    # BUG FIX: MSE is an error, so lower is better. Without
    # greater_is_better=False, permutation_importance treats a *rise* in MSE
    # as a drop in score and the importances come out with inverted sign --
    # one plausible cause of the reported "low" importances.
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    results = permutation_importance(gs_sel, X_train_sel, y_train,
                                     n_repeats=10, random_state=42,
                                     scoring=mse_scorer)

    # Report features whose importance is significantly above zero
    # (mean minus two standard deviations), most important first.
    with open("results_2.txt", "a") as f:
        for i in results.importances_mean.argsort()[::-1]:
            if results.importances_mean[i] - 2 * results.importances_std[i] > 0:
                # BUG FIX: the original write had no newline (all rows ran
                # together) and a stray literal "scoring" in the output.
                f.write(f"{dataset.columns[i]:<8} "
                        f"{results.importances_mean[i]:.3f}"
                        f" +/- {results.importances_std[i]:.3f}\n")
 
 
# Entry point guard: required on Windows because run() calls
# torch.multiprocessing.freeze_support().
if __name__ == '__main__':
    run()

def run():
    torch.multiprocessing.freeze_support()

    dataset = pd.read_excel("dataset_withgender.xlsx")
    target = dataset['ProteinX']
    dataset = dataset.iloc[:, 1:9154]
    X_train, X_test, y_train, y_test = train_test_split(dataset.values.astype(np.float32),
                                                        target.values.reshape(-1).astype(np.float32),
                                                        test_size=.3,
                                                        random_state=42)

    class MultiLayerPredictor(torch.nn.Module):
        def __init__(self, input_shape=8778, output_shape=1, hidden_dim=1024, **kwargs):
            super().__init__()
            self.fc1 = torch.nn.Linear(in_features=input_shape, out_features=hidden_dim)
            self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
            self.fc3 = torch.nn.Linear(in_features=hidden_dim, out_features=output_shape)

        def forward(self, x):
            l1 = torch.relu(self.fc1(x))
            l2 = torch.relu(self.fc2(l1))
            return torch.sigmoid(self.fc3(l2)).reshape(-1)

    net = NeuralNet(
        MultiLayerPredictor,
        criterion=nn.MSELoss,
        max_epochs=10,
        optimizer=optim.Adam,
        lr=0.1,
        iterator_train__shuffle=False
    )

    lr = (10**np.random.uniform(-5, -2.5, 1000)).tolist()
    params = {
        'optimizer__lr': lr,
        'max_epochs': [300, 400, 500],
        'module__num_units': [14, 20, 28, 36, 42],
        'module__drop': [0, .1, .2, .3, .4]
    }

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
    corr = spearmanr(dataset).correlation
    corr_linkage = hierarchy.ward(corr)
    dendro = hierarchy.dendrogram(
        corr_linkage, labels=dataset.columns.values.tolist(), ax=ax1, leaf_rotation=90)
    dendro_idx = np.arange(0, len(dendro['ivl']))

    ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
    ax2.set_xticks(dendro_idx)
    ax2.set_yticks(dendro_idx)
    ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
    ax2.set_yticklabels(dendro['ivl'])
    fig.tight_layout()

    cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)
    selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

    print("Set feature sets")
    X_train_sel = X_train[:, selected_features]
    X_test_sel = X_test[:, selected_features]

    net_selected = net2
    gs_sel = RandomizedSearchCV(net_selected, params, refit=True, cv=3, scoring='neg_mean_squared_error', n_iter=2)
    gs_sel.fit(X_train_sel, y_train)

    mkaccuracy_score = make_scorer(mean_squared_error)

    results = permutation_importance(gs_sel, X_train_sel, y_train, n_repeats=10, random_state=42, scoring=mkaccuracy_score)
    sorted_idx = results.importances_mean.argsort()

    f = open("results_2.txt", "a")
    for i in results.importances_mean.argsort()[::-1]:
        if results.importances_mean[i] - 2 * results.importances_std[i] > 0:
            f.write(f"{dataset.columns[i]:<8}")