Python 基于类返回numpy数组的分发示例

Python：基于类分布对numpy数组进行分层折叠划分的示例（标签：python、numpy、machine-learning、sampling、numpy-ndarray）

请注意：本任务限定我只能使用numpy来完成

我试图编写一个函数来实现以下目标:

  • 将数据集加载到numpy数组中
  • 将数据集拆分为5个“相等”(或尽可能相等)的折叠
  • 对于每个折叠,确保分别有80/20的数据用于培训和测试
  • 这里有一个陷阱。原始输入数据集已“标记”,最后一列包含分类。折叠需要保持与输入集相同的类大小分布
  • 例如,如果我有
    input=100个样本(行)
    ,并且有两个类(由最后一列中的值表示),A和B,具有33%和67%的分割,那么我应该创建5个折叠,每个折叠包含20个样本,其中6或7个样本是A,13或14个样本是B

    这就是我正在努力实现的目标。我不知道如何正确地确保折叠本身包含类的正确采样分布

    到目前为止,我有以下代码来显示我的尝试。到目前为止,我已经编写了两个函数,它们能够告诉我输入类的分布情况,并且能够创建5个折叠。然而,我需要找到一种方法来组合这些,并创建5个保持各自分布的折叠

    import numpy
    
    def csv_to_array(file):
        """Load a comma-separated file into a 2-D float numpy array.

        Parameters
        ----------
        file : str
            Path to the CSV file; every field must be numeric.

        Returns
        -------
        numpy.ndarray
            One row per CSV line, values converted to float.

        Raises
        ------
        ValueError
            If any field cannot be parsed as a float (raised by loadtxt).
        """
        # 'with' guarantees the handle is closed; the original opened the
        # file and never closed it.
        with open(file, 'r') as handle:
            # loadtxt already converts every field to float and raises on
            # malformed input, so the original per-row try/except loop was
            # dead code (its second `except ValueError` was also unreachable
            # after `except Exception`).
            data = numpy.loadtxt(handle, delimiter=',')
        return data
    
    
    def class_distribution(dataset):
        """Print the count and percentage of each class in the label column.

        The class label is assumed to be the LAST column of *dataset*.
        Returns None; results are printed to stdout (same contract as the
        original).
        """
        dataset = numpy.asarray(dataset)
        num_total_rows = dataset.shape[0]
        # unique(return_counts=True) replaces the original O(rows * classes)
        # double loop with a single vectorized pass over the label column.
        classes, counts = numpy.unique(dataset[:, -1], return_counts=True)
        for aclass, total in zip(classes, counts):
            print(aclass, " Has: ", ((total / num_total_rows) * 100))
            print(aclass, " : ", total)
    
    
    def create_folds(dataset):
        """Shuffle *dataset* in place and split its rows into 5 folds.

        Returns a list of 5 numpy arrays whose sizes differ by at most one
        row.  The original hand-rolled five slices and then called
        numpy.asarray on them, which fails on ragged folds (row count not
        divisible by 5) in modern numpy; numpy.array_split handles uneven
        splits directly.

        NOTE: the shuffle mutates the caller's array, matching the original
        behavior.
        """
        numpy.random.shuffle(dataset)
        return numpy.array_split(dataset, 5)
    
    
    def main():
        """Entry point: load the ecoli CSV, report its class mix, build folds."""
        print("BEGINNING CFV")
        samples = csv_to_array('Classification/ecoli.csv')
        class_distribution(samples)
        create_folds(samples)
    
    main()
    
    下面是我正在使用的csv示例，最后一列表示该类。它是对UCI机器学习存储库中的数据集的修改：

    0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
    0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
    0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
    0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
    0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
    0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
    0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
    0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
    0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
    0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
    0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
    0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
    0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
    0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
    0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
    0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
    0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
    0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
    0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
    0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
    0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
    0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
    0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
    0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
    0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
    0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
    0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
    0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
    0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
    0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
    0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
    0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
    0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
    0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
    0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
    0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
    0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
    0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
    0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
    0,0.38,0.48,0.5,0.42,0.48,0.55,0
    0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
    0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
    0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
    0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
    0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
    0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
    0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
    0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
    0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
    0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
    0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
    0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
    0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
    0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
    0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
    0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
    0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
    0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
    0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
    0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
    0.52,0.39,0.48,0.5,0.65,0.71,0.73,1
    0.29,0.47,0.48,0.5,0.71,0.65,0.69,1
    0.55,0.47,0.48,0.5,0.57,0.78,0.8,1
    0.12,0.67,0.48,0.5,0.74,0.58,0.63,1
    0.4,0.5,0.48,0.5,0.65,0.82,0.84,1
    0.73,0.36,0.48,0.5,0.53,0.91,0.92,1
    0.84,0.44,0.48,0.5,0.48,0.71,0.74,1
    0.48,0.45,0.48,0.5,0.6,0.78,0.8,1
    0.54,0.49,0.48,0.5,0.4,0.87,0.88,1
    0.48,0.41,0.48,0.5,0.51,0.9,0.88,1
    0.5,0.66,0.48,0.5,0.31,0.92,0.92,1
    0.72,0.46,0.48,0.5,0.51,0.66,0.7,1
    0.47,0.55,0.48,0.5,0.58,0.71,0.75,1
    0.33,0.56,0.48,0.5,0.33,0.78,0.8,1
    0.64,0.58,0.48,0.5,0.48,0.78,0.73,1
    0.11,0.5,0.48,0.5,0.58,0.72,0.68,1
    0.31,0.36,0.48,0.5,0.58,0.94,0.94,1
    0.68,0.51,0.48,0.5,0.71,0.75,0.78,1
    0.69,0.39,0.48,0.5,0.57,0.76,0.79,1
    0.52,0.54,0.48,0.5,0.62,0.76,0.79,1
    0.46,0.59,0.48,0.5,0.36,0.76,0.23,1
    0.36,0.45,0.48,0.5,0.38,0.79,0.17,1
    0,0.51,0.48,0.5,0.35,0.67,0.44,1
    0.1,0.49,0.48,0.5,0.41,0.67,0.21,1
    0.3,0.51,0.48,0.5,0.42,0.61,0.34,1
    0.61,0.47,0.48,0.5,0,0.8,0.32,1
    0.63,0.75,0.48,0.5,0.64,0.73,0.66,1
    0.71,0.52,0.48,0.5,0.64,1,0.99,1
    0.72,0.42,0.48,0.5,0.65,0.77,0.79,2
    0.79,0.41,0.48,0.5,0.66,0.81,0.83,2
    0.83,0.48,0.48,0.5,0.65,0.76,0.79,2
    0.69,0.43,0.48,0.5,0.59,0.74,0.77,2
    0.79,0.36,0.48,0.5,0.46,0.82,0.7,2
    0.78,0.33,0.48,0.5,0.57,0.77,0.79,2
    0.75,0.37,0.48,0.5,0.64,0.7,0.74,2
    0.59,0.29,0.48,0.5,0.64,0.75,0.77,2
    0.67,0.37,0.48,0.5,0.54,0.64,0.68,2
    0.66,0.48,0.48,0.5,0.54,0.7,0.74,2
    0.64,0.46,0.48,0.5,0.48,0.73,0.76,2
    0.76,0.71,0.48,0.5,0.5,0.71,0.75,2
    0.84,0.49,0.48,0.5,0.55,0.78,0.74,2
    0.77,0.55,0.48,0.5,0.51,0.78,0.74,2
    0.81,0.44,0.48,0.5,0.42,0.67,0.68,2
    0.58,0.6,0.48,0.5,0.59,0.73,0.76,2
    0.63,0.42,0.48,0.5,0.48,0.77,0.8,2
    0.62,0.42,0.48,0.5,0.58,0.79,0.81,2
    0.86,0.39,0.48,0.5,0.59,0.89,0.9,2
    0.81,0.53,0.48,0.5,0.57,0.87,0.88,2
    0.87,0.49,0.48,0.5,0.61,0.76,0.79,2
    0.47,0.46,0.48,0.5,0.62,0.74,0.77,2
    0.76,0.41,0.48,0.5,0.5,0.59,0.62,2
    0.7,0.53,0.48,0.5,0.7,0.86,0.87,2
    0.64,0.45,0.48,0.5,0.67,0.61,0.66,2
    0.81,0.52,0.48,0.5,0.57,0.78,0.8,2
    0.73,0.26,0.48,0.5,0.57,0.75,0.78,2
    0.49,0.61,1,0.5,0.56,0.71,0.74,2
    0.88,0.42,0.48,0.5,0.52,0.73,0.75,2
    0.84,0.54,0.48,0.5,0.75,0.92,0.7,2
    0.63,0.51,0.48,0.5,0.64,0.72,0.76,2
    0.86,0.55,0.48,0.5,0.63,0.81,0.83,2
    0.79,0.54,0.48,0.5,0.5,0.66,0.68,2
    0.57,0.38,0.48,0.5,0.06,0.49,0.33,2
    0.78,0.44,0.48,0.5,0.45,0.73,0.68,2
    0.78,0.68,0.48,0.5,0.83,0.4,0.29,3
    0.63,0.69,0.48,0.5,0.65,0.41,0.28,3
    0.67,0.88,0.48,0.5,0.73,0.5,0.25,3
    0.61,0.75,0.48,0.5,0.51,0.33,0.33,3
    0.67,0.84,0.48,0.5,0.74,0.54,0.37,3
    0.74,0.9,0.48,0.5,0.57,0.53,0.29,3
    0.73,0.84,0.48,0.5,0.86,0.58,0.29,3
    0.75,0.76,0.48,0.5,0.83,0.57,0.3,3
    0.77,0.57,0.48,0.5,0.88,0.53,0.2,3
    0.74,0.78,0.48,0.5,0.75,0.54,0.15,3
    0.68,0.76,0.48,0.5,0.84,0.45,0.27,3
    0.56,0.68,0.48,0.5,0.77,0.36,0.45,3
    0.65,0.51,0.48,0.5,0.66,0.54,0.33,3
    0.52,0.81,0.48,0.5,0.72,0.38,0.38,3
    0.64,0.57,0.48,0.5,0.7,0.33,0.26,3
    0.6,0.76,1,0.5,0.77,0.59,0.52,3
    0.69,0.59,0.48,0.5,0.77,0.39,0.21,3
    0.63,0.49,0.48,0.5,0.79,0.45,0.28,3
    0.71,0.71,0.48,0.5,0.68,0.43,0.36,3
    0.68,0.63,0.48,0.5,0.73,0.4,0.3,3
    0.74,0.49,0.48,0.5,0.42,0.54,0.36,4
    0.7,0.61,0.48,0.5,0.56,0.52,0.43,4
    0.66,0.86,0.48,0.5,0.34,0.41,0.36,4
    0.73,0.78,0.48,0.5,0.58,0.51,0.31,4
    0.65,0.57,0.48,0.5,0.47,0.47,0.51,4
    0.72,0.86,0.48,0.5,0.17,0.55,0.21,4
    0.67,0.7,0.48,0.5,0.46,0.45,0.33,4
    0.67,0.81,0.48,0.5,0.54,0.49,0.23,4
    0.67,0.61,0.48,0.5,0.51,0.37,0.38,4
    0.63,1,0.48,0.5,0.35,0.51,0.49,4
    0.57,0.59,0.48,0.5,0.39,0.47,0.33,4
    0.71,0.71,0.48,0.5,0.4,0.54,0.39,4
    0.66,0.74,0.48,0.5,0.31,0.38,0.43,4
    0.67,0.81,0.48,0.5,0.25,0.42,0.25,4
    0.64,0.72,0.48,0.5,0.49,0.42,0.19,4
    0.68,0.82,0.48,0.5,0.38,0.65,0.56,4
    0.32,0.39,0.48,0.5,0.53,0.28,0.38,4
    0.7,0.64,0.48,0.5,0.47,0.51,0.47,4
    0.63,0.57,0.48,0.5,0.49,0.7,0.2,4
    0.69,0.65,0.48,0.5,0.63,0.48,0.41,4
    0.43,0.59,0.48,0.5,0.52,0.49,0.56,4
    0.74,0.56,0.48,0.5,0.47,0.68,0.3,4
    0.71,0.57,0.48,0.5,0.48,0.35,0.32,4
    0.61,0.6,0.48,0.5,0.44,0.39,0.38,4
    0.59,0.61,0.48,0.5,0.42,0.42,0.37,4
    0.74,0.74,0.48,0.5,0.31,0.53,0.52,4
    

    在采纳@AlexL的建议后,我查看了代码并开发了一个修改版本,其中包含以下两个功能:

    # This function returns the list of classes, and their associated weights (i.e. distributions)
    def class_distribution(dataset):
        """Return the unique classes and their weights for the label column.

        Parameters
        ----------
        dataset : array-like
            Rows of samples; the class label is the LAST column.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Sorted unique class values, and the fraction of rows belonging
            to each class, aligned element-wise with the class array.
        """
        dataset = numpy.asarray(dataset)
        num_total_rows = dataset.shape[0]
        # One vectorized pass replaces the original per-class scan over all
        # rows (O(rows * classes)).
        classes, counts = numpy.unique(dataset[:, -1], return_counts=True)
        class_weights = counts / num_total_rows
        return classes, class_weights
    
    # This functions performs k cross fold validation for classification
    def cross_fold_validation_classification(dataset, k):
        """Split *dataset* into k stratified folds for cross validation.

        Each fold receives (as nearly as possible) the same per-class
        proportions as the whole dataset; every row lands in exactly one
        fold.  The class label is the LAST column.

        Returns a list of k lists of row arrays.

        Fixes two bugs in the original:
        * numpy.delete returns a NEW array and its result was discarded, so
          the same row could be handed to several folds (sampling with
          replacement);
        * the final `return` was dedented outside the function body, which
          is a syntax error.
        """
        data = numpy.array(dataset, copy=True)
        labels = data[:, -1]
        total_fold_array = [[] for _ in range(k)]

        # Handle one class at a time so every fold keeps the class mix.
        for a_class in numpy.unique(labels):
            class_rows = data[labels == a_class]
            numpy.random.shuffle(class_rows)
            # array_split spreads this class's rows over the k folds as
            # evenly as possible, consuming each row exactly once.
            for fold, chunk in zip(total_fold_array, numpy.array_split(class_rows, k)):
                fold.extend(chunk)

        return total_fold_array
    

    在采纳@AlexL的建议后,我查看了代码并开发了一个修改版本,其中包含以下两个功能:

    # This function returns the list of classes, and their associated weights (i.e. distributions)
    def class_distribution(dataset):
        """Return the unique classes and their weights for the label column.

        Parameters
        ----------
        dataset : array-like
            Rows of samples; the class label is the LAST column.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Sorted unique class values, and the fraction of rows belonging
            to each class, aligned element-wise with the class array.
        """
        dataset = numpy.asarray(dataset)
        num_total_rows = dataset.shape[0]
        # One vectorized pass replaces the original per-class scan over all
        # rows (O(rows * classes)).
        classes, counts = numpy.unique(dataset[:, -1], return_counts=True)
        class_weights = counts / num_total_rows
        return classes, class_weights
    
    # This functions performs k cross fold validation for classification
    def cross_fold_validation_classification(dataset, k):
        """Split *dataset* into k stratified folds for cross validation.

        Each fold receives (as nearly as possible) the same per-class
        proportions as the whole dataset; every row lands in exactly one
        fold.  The class label is the LAST column.

        Returns a list of k lists of row arrays.

        Fixes two bugs in the original:
        * numpy.delete returns a NEW array and its result was discarded, so
          the same row could be handed to several folds (sampling with
          replacement);
        * the final `return` was dedented outside the function body, which
          is a syntax error.
        """
        data = numpy.array(dataset, copy=True)
        labels = data[:, -1]
        total_fold_array = [[] for _ in range(k)]

        # Handle one class at a time so every fold keeps the class mix.
        for a_class in numpy.unique(labels):
            class_rows = data[labels == a_class]
            numpy.random.shuffle(class_rows)
            # array_split spreads this class's rows over the k folds as
            # evenly as possible, consuming each row exactly once.
            for fold, chunk in zip(total_fold_array, numpy.array_split(class_rows, k)):
                fold.extend(chunk)

        return total_fold_array
    

    虽然你注定要独自去numpy,但我强烈建议你从scikit learn中搜索全班。这正好完成了您想要做的事情,在他们的代码(位于numpy之上)中看到这一点可能会有所帮助。他们利用了他们的
    KFold
    类,但你也可以挖掘出来。这是一个伟大的建议@AlexL,我有关于尝试这样做的建议。我猜我在哪里挣扎是因为没有足够的基础来真正了解他们在做什么,短的只是复制他们的代码,插上和玩,直到我得到我的工作,这不是很好的实践。在某些情况下,不一定坏复制和粘贴代码和修补它从那里。在这种情况下,我建议一行一行地进行。在上面的文档中,我认为638-642、654-663和665-676是这门课的重点。查看第430-443行,查看KFold的主要工作原理。祝你好运虽然你注定要独自去numpy,但我强烈建议你从scikit learn中搜索全班。这正好完成了您想要做的事情,在他们的代码(位于numpy之上)中看到这一点可能会有所帮助。他们利用了他们的
    KFold
    类,但你也可以挖掘出来。这是一个伟大的建议@AlexL,我有关于尝试这样做的建议。我猜我在哪里挣扎是因为没有足够的基础来真正了解他们在做什么,短的只是复制他们的代码,插上和玩,直到我得到我的工作,这不是很好的实践。在某些情况下,不一定坏复制和粘贴代码和修补它从那里。在这种情况下,我建议一行一行地进行。在上面的文档中,我认为638-642、654-663和665-676是这门课的重点。查看第430-443行,查看KFold的主要工作原理。祝你好运