Python 分类类型错误：'；int'；对象不可下标_Python_Split_Classification_List Comprehension

Python 分类类型错误：'；int'；对象不可下标

python

Python 分类类型错误：'；int'；对象不可下标,python,split,classification,list-comprehension,Python,Split,Classification,List Comprehension,我正在使用熵制作一个id3分类器，用于成本计算，但当我拆分分类属性时，我不断得到一个TypeError：当运行行：child=[x for x in records if x[attr_name]in a_I]时，在方法：split_category中，“int”对象不可下标数据集：桑尼，85，85，假，不桑妮，80，90，对，不阴，83，78，假，对雨，70，96，假，是的雨，68，80，假，是的雨，65，70，真的，不乌云密布，64，65，没错，是的桑尼，72，95，假，不

我正在使用熵制作一个id3分类器，用于成本计算，但当我拆分分类属性时，我不断得到一个TypeError：当运行行：child=[x for x in records if x[attr_name]in a_I]时，在方法：split_category中，“int”对象不可下标

数据集：

桑尼，85，85，假，不桑妮，80，90，对，不阴，83，78，假，对雨，70，96，假，是的雨，68，80，假，是的雨，65，70，真的，不乌云密布，64，65，没错，是的桑尼，72，95，假，不桑妮，69，70，错，是的雨，75，80，假，是的桑妮，75岁，70岁，是的乌云密布，72，90，没错，是的阴，81，75，假，是的雨，71，80，对，不

完整代码：

import csv
import math
from statistics import median, mode
from collections import Counter
from enum import Enum


class AttrType(Enum):
    cat = 0  # categorical (qualitative) attribute
    num = 1  # numerical (quantitative) attribute
    target = 2  # target label


class NodeType(Enum):
    root = 0
    internal = 1
    leaf = 2


class SplitType(Enum):
    bin = 0  # binary split
    multi = 1  # multi-way split


class Attribute(object):
    def __init__(self, label, type):
        assert type in AttrType
        self.label = label
        self.type = type
        self.stat = None  # holds mean for numerical and mode for categorical attributes


class Splitting(object):
    def __init__(self, attr, infogain, split_type, cond, splits):
        self.attr = attr  # attribute ID (index in ATTR)
        self.infogain = infogain  # information gain if splitting is done on this attribute
        self.split_type = split_type  # one of SplitType
        self.cond = cond  # splitting condition, i.e., values on outgoing edges
        self.splits = splits  # list of training records (IDs) for each slitting condition


class Node(object):
    def __init__(self, id, type, parent_id, children=None, edge_value=None, val=None, split_type=None, split_cond=None):
        self.id = id  # ID (same as the index in DT.model list)
        self.type = type  # one of NodeType
        self.parent_id = parent_id  # ID of parent node (None if root)
        self.children = children  # list of IDs of child nodes
        self.edge_value = edge_value  # the value of the incoming edge (only if not root node)
        self.val = val  # if root or internal node: the attribute that is compared at that node; if leaf node: the target value
        self.split_type = split_type  # one of SplitType
        self.split_cond = split_cond  # splitting condition (median value for binary splits on numerical values; otherwise a list of categorical values (corresponding to child nodes))
        self.infogain = infogain

    def append_child(self, node_id):
        self.children.append(node_id)


# input filename and data format are hard-coded here
INFILE = "data/example.csv"
# attribute labels types (same order as in the file!)
ATTR = [Attribute("Outlook", AttrType.cat), Attribute("Temperature", AttrType.num),
        Attribute("Humidity", AttrType.num), Attribute("Windy", AttrType.cat), Attribute("Play?", AttrType.target)]
IDX_TARGET = len(ATTR) - 1  # index of the target attribute (assuming it's the last)


#main class:
class DT(object):
    def __init__(self):
        self.data = None  # training data set (loaded into memory)
        self.model = None  # decision tree model
        self.default_class = None  # default target class

    def __load_data(self):
        with open(INFILE) as csvfile:
            self.data = []
            csvreader = csv.reader(csvfile, delimiter=',')
            for row in csvreader:
                rec = []
                for i in range(len(ATTR)):
                    val = row[i].strip()
                    # convert numerical attributes
                    if ATTR[i].type == AttrType.num:  # Note that this will break for "?" (missing attribute)
                        val = float(val)
                    rec.append(val)
                self.data.append(rec)
                # self.data.append([element.strip() for element in row])  # strip spaces

    def __entropy(self, records):
        """
        Calculates entropy for a selection of records.

        :param records: Data records (given by indices)
        """
        # TODO          records er en liste med index til hver record
        #Entropy of a list of records associated with a node. 

        dat = {key:self.data[key] for key in records}
        count = Counter([x[4] for x in dat.values()]) #target=4="Play?"
        return sum([(-freq / len(dat)) * math.log(freq / len(dat), 2) for freq in count.values()])
        #return sum([(-freq / len(self.data)) * math.log(freq / len(self.data), 2) for freq in records])
    def split_categorical(self, records, attr_name, values_sets):
        print("Splitting by {}".format(attr_name))
        children = []
        for a_i in values_sets:  # for each subset of possible values\n",
            child = [x for x in records if x[attr_name] in a_i]  
            children.append(child)
            # e.g. if values_sets = [{\"sunny\"}, {\"overcast\", \"rain\"}], and atr_name = \"Outlook\"\n",
            # then, in the 2nd iteration, a_i = {\"overcast\", \"rain\"},\n",
            # so child = list of records for which the value in \"Outlook\" attr is in {\"overcast\", \"rain\"}\n",

            # We also print the entropy for each child\n",
        print("Child condition: {}Size = {}Entropy = {}".format(a_i, len(child), entropy(child)))
        return children
    def split_numeric_binary(self, records, attr_name, splitting_point):
        print("Splitting by {}".format(attr_name))
        children = [[x for x in records if x[attr_name] <= splitting_point],
                   [x for x in records if x[attr_name] > splitting_point]]
        # We also print the entropy for each child
        print("'Less-or-equal-than' child. Size = {}Entropy = {}".format(len(children[0]), entropy(children[0])))
        print("'Greater-than' child. Size = {}Entropy = {}".format(len(children[1]), entropy(children[1])))
        return children
    def infogain(self, parent_records, children_records):
        #param parent_records: list of records associated with the parent node.
        #param children_records: list of lists, each list contains all the records associated with one child.
        entropy_p = entropy(parent_records)
        return entropy_p - sum([(len(child_records) / len(parent_records)) * entropy(child_records)
                                    for child_records in children_records])
    def __find_best_attr(self, attrs, records):
        """
        Finds the attribute with the largest gain.

        :param attrs: Set of attributes
        :param records: Training set (list of record ids)
        :return:
        """
        entropy_p = self.__entropy(records)  # parent's entropy
        splittings = []  # holds the splitting information for each attribute

        for a in attrs:
            assert ATTR[a].type in AttrType
            splits = {}  # record IDs corresponding to each split
            children = []
            # splitting condition depends on the attribute type
            if ATTR[a].type == AttrType.target:  # skip target attribute
                continue
            elif ATTR[a].type == AttrType.cat:  # categorical attribute
                # multi-way split on each possible value
                split_mode = SplitType.multi
                # each possible attr value corresponds to a split (indexed with categorical labels)
                # Note: it's important to consider attr values from the entire training set
                split_cond = set([self.data[idx][a] for idx in range(len(self.data))])

                # TODO collect training records for each split 
                # `splits[val]` holds a list of records for a given split,
                # where `val` is an element of `split_cond`
                #split_categorical og legg til resultat i splits
                children = self.split_categorical(records,a,split_cond)
                for i, val in enumerate(split_cond):
                    splits[val]=children[i] #get records for given split
            elif ATTR[a].type == AttrType.num:  # numerical attribute => binary split on median value
                split_mode = SplitType.bin
                split_cond = self.__median(a)  # (i.e., if less or equal than this value)
                # TODO collect training records for each split (in `splits`)
                children = self.split_numeric_binary(records, a, split_cond)
                for i, val in enumerate(split_cond):
                    splits[val]=children[i]
            # TODO compute gain for attribute a
            infogain = self.infogain(records, children)

            splitting = Splitting(a, infogain, split_mode, split_cond, splits)
            splittings.append(splitting)

        # find best splitting
        best_splitting = sorted(splittings, key=lambda x: x.infogain, reverse=True)[0]
        return best_splitting

    def __add_node(self, parent_id, node_type=NodeType.internal, edge_value=None, val=None, split_type=None,
                   split_cond=None):
        """
        Adds a node to the decision tree.

        :param parent_id:
        :param node_type:
        :param edge_value:
        :param val:
        :param split_type:
        :param split_cond:
        :return:
        """
        node_id = len(self.model)  # id of the newly assigned node
        if not self.model:  # the tree is empty
            node_type = NodeType.root

        node = Node(node_id, node_type, parent_id, children=[], edge_value=edge_value, val=val, split_type=split_type,
                    split_cond=split_cond)
        self.model.append(node)

        # also add it as a child of the parent node
        if parent_id is not None:
            self.model[parent_id].append_child(node_id)

        return node_id

    def __id3(self, attrs, records, parent_id=None, value=None):
        """
        Function ID3 that returns a decision tree.

        :param attrs: Set of attributes
        :param records: Training set (list of record ids)
        :param parent_id: ID of parent node
        :param value: Value corresponding to the parent attribute, i.e., label of the edge on which we arrived to this node
        :return:
        """
        # empty training set or empty set of attributes => create leaf node with default class
        if not records or not attrs:
            self.__add_node(parent_id, node_type=NodeType.leaf, edge_value=value, val=self.default_class)
            return

        # if all records have the same target value => create leaf node with that target value
        same = all(self.data[idx][IDX_TARGET] == self.data[records[0]][IDX_TARGET] for idx in records)
        if same:
            target = self.data[records[0]][IDX_TARGET]
            self.__add_node(parent_id, node_type=NodeType.leaf, edge_value=value, val=target)
            return

        # find the attribute with the largest gain
        splitting = self.__find_best_attr(attrs, records)
        # add node
        node_id = self.__add_node(parent_id, edge_value=value, val=splitting.attr, split_type=splitting.split_type,
                                  split_cond=splitting.cond)
        # TODO call tree construction recursively for each split
        node = self.model[node_id]
        for n_id in node.children:
            self.__id3(attrs, records, node_id, node.val)
        return self.model
    def print_model(self, node_id=0, level=0):
        node = self.model[node_id]
        indent = "  " * level
        if node.type == NodeType.leaf:
            print(indent + str(node.edge_value) + " [Leaf node] class=" + node.val)
        else:
            cond = " <= " + str(node.split_cond) if ATTR[node.val].type == AttrType.num else " == ? "
            if node.type == NodeType.root:
                print("[Root node] '" + ATTR[node.val].label + "'" + cond)
            else:
                print(indent + str(node.edge_value) + " [Internal node] '" + ATTR[node.val].label + "'" + cond)
            # print tree for child notes recursively
            for n_id in node.children:
                self.print_model(n_id, level + 1)

    def build_model(self):
        self.__load_data()
        #print(list(range(len(self.data))))
        #print(list(range(len(self.data))))
        self.model = []  # holds the decision tree model, represented as a list of nodes
        # Get majority class
        #   Note: Counter returns a dictionary, most_common(x) returns a list with the x most common elements as
        #         (key, count) tuples; we need to take the first element of the list and the first element of the tuple
        self.default_class = Counter([x[IDX_TARGET] for x in self.data]).most_common(1)[0][0]
        self.__id3(set(range(len(ATTR) - 1)), list(range(len(self.data))))

    def apply_model(self, record):
        node = self.model[0]
        while node.type != NodeType.leaf:
            # TODO based on the value of the record's attribute that is tested in `node`,
            # set `node` to one of its child nodes until a leaf node is reached
            for n_id in node.children:
                node = self.model[n_id]
        return node.val

def main():
    dt = DT()
    print("Build model:")
    dt.build_model()
    dt.print_model()

    print("\nApply model:")
    print(dt.apply_model(['sunny', 85, 85, 'false']))
    print(dt.apply_model(['overcast', 75, 85, 'true']))
    print(dt.apply_model(['rain', 75, 85, 'false']))

if __name__ == "__main__":

导入csv
输入数学
从统计导入中值，模式
从收款进口柜台
从枚举导入枚举
类属性类型（枚举）：
cat=0#分类（定性）属性
num=1#数字（定量）属性
目标=2#目标标签
类节点类型（枚举）：
根=0
内部=1
叶=2
类拆分类型（枚举）：
bin=0#二进制分割
多路=1#多路分割
类属性（对象）：
定义初始化（自我、标签、类型）：
属性类型中的断言类型
self.label=标签
self.type=type
self.stat=None#表示数值属性的平均值，表示分类属性的模式
类拆分（对象）：
定义初始化（自身、属性、信息增益、拆分类型、条件、拆分）：
self.attr=attr#属性ID（attr中的索引）
self.infogain=infogain#如果对该属性进行拆分，则信息增益
self.split_type=split_type#SplitType之一
self.cond=cond#分割条件，即输出边上的值
self.splits=splits#每个分切条件的培训记录（ID）列表
类节点（对象）：
定义初始化（self、id、type、parent\u id、children=None、edge\u value=None、val=None、split\u type=None、split\u cond=None）：
self.id=id#id（与DT.model列表中的索引相同）
self.type=type#节点类型之一
self.parent_id=parent_id#父节点的id（如果是root，则无）
self.children=children#子节点ID列表
self.edge_value=edge_value#传入边的值（仅当不是根节点时）
self.val=val#如果是根节点或内部节点：在该节点上比较的属性；如果叶节点：目标值
self.split_type=split_type#SplitType之一
self.split_cond=split_cond#spliting条件（数值上二进制分割的中值；否则为分类值列表（对应于子节点））
self.infogain=infogain
def append_子项（自身，节点id）：
self.children.append（节点\u id）
#输入文件名和数据格式在此处硬编码
infle=“data/example.csv”
#属性标签类型（与文件中的顺序相同！）
ATTR=[属性（“Outlook”，AttrType.cat），属性（“Temperature”，AttrType.num），
属性（“湿度”，AttrType.num），属性（“风”，AttrType.cat），属性（“播放”，AttrType.target）]
IDX_TARGET=len（ATTR）-1#目标属性的索引（假设它是最后一个）
#主要类别：
DT类（对象）：
定义初始化（自）：
self.data=None#训练数据集（加载到内存中）
self.model=无#决策树模型
self.default_class=None#默认目标类
定义加载数据（自）：
打开（填充）为csvfile时：
self.data=[]
csvreader=csv.reader（csvfile，分隔符='，'）
对于csvreader中的行：
rec=[]
对于范围内的i（len（ATTR））：
val=行[i].strip（）
#转换数值属性
如果ATTR[i].type==AttrType.num:#请注意，这将中断“”（缺少属性）
val=浮动（val）
记录附加（val）
self.data.append（rec）
#self.data.append（[element.strip（）表示行中的元素]）#条带空间
定义熵（自、记录）：
"""
计算选定记录的熵。
：参数记录：数据记录（由索引给出）
"""
#TODO记录在记录之前列出中间索引
#与节点关联的记录列表的熵。
dat={key:self.data[key]用于记录中的key}
count=计数器（[x[4]表示数据值（）中的x）#target=4=“播放？”
count.values（）中freq的返回和（[（-freq/len（dat））*math.log（freq/len（dat），2）
#返回和（[（-freq/len（self.data））*math.log（freq/len（self.data），2）用于记录中的频率]）
def split_分类（自身、记录、属性名称、值集）：
打印（“按{}拆分”。格式（属性名称））
儿童=[]
对于值集中的值：#对于可能值的每个子集\n“，
child=[x代表记录中的x，如果a_i中的x[attr_name]
children.append（child）
#例如，如果值\u set=[{“sunny\”}、{“clowst\”、“rain\”}]和atr\u name=\“Outlook\”\n，
#然后，在第二次迭代中，a_i={“阴天”，“雨”}、\n“，
#so child=记录列表，其中\“Outlook\”attr中的值位于{\“clowst\，\“rain\”}\n“，
#我们还打印每个子项的熵\n“，
打印（“子条件：{}大小={}熵={}”。格式（a_i，len（Child），Entropy（Child）））
返回儿童
def分割数值二进制（自身、记录、属性名称、分割点）：
打印（“按{}拆分”。格式（属性名称））
children=[[x表示x，如果x[attr\u name]正在拆分点]]
#我们还打印每个孩子的熵
打印（“'Less-or-equal-than'child.Size={}熵={}”。格式（len（children[0]），熵（children[0]））
打印（“'bether-than'child.Size={}Entrop）