Python 分类类型错误:';int';对象不可下标
我正在使用熵制作一个id3分类器,用于成本计算,但当我拆分分类属性时,我不断得到一个TypeError:当运行行:child=[x for x in records if x[attr_name]in a_I]时,在方法:split_category中,“int”对象不可下标 数据集: 桑尼,85,85,假,不 桑妮,80,90,对,不 阴,83,78,假,对 雨,70,96,假,是的 雨,68,80,假,是的 雨,65,70,真的,不 乌云密布,64,65,没错,是的 桑尼,72,95,假,不 桑妮,69,70,错,是的 雨,75,80,假,是的 桑妮,75岁,70岁,是的 乌云密布,72,90,没错,是的 阴,81,75,假,是的 雨,71,80,对,不 完整代码:Python 分类类型错误:';int';对象不可下标,python,split,classification,list-comprehension,Python,Split,Classification,List Comprehension,我正在使用熵制作一个id3分类器,用于成本计算,但当我拆分分类属性时,我不断得到一个TypeError:当运行行:child=[x for x in records if x[attr_name]in a_I]时,在方法:split_category中,“int”对象不可下标 数据集: 桑尼,85,85,假,不 桑妮,80,90,对,不 阴,83,78,假,对 雨,70,96,假,是的 雨,68,80,假,是的 雨,65,70,真的,不 乌云密布,64,65,没错,是的 桑尼,72,95,假,不
import csv
import math
from statistics import median, mode
from collections import Counter
from enum import Enum
class AttrType(Enum):
cat = 0 # categorical (qualitative) attribute
num = 1 # numerical (quantitative) attribute
target = 2 # target label
class NodeType(Enum):
root = 0
internal = 1
leaf = 2
class SplitType(Enum):
bin = 0 # binary split
multi = 1 # multi-way split
class Attribute(object):
def __init__(self, label, type):
assert type in AttrType
self.label = label
self.type = type
self.stat = None # holds mean for numerical and mode for categorical attributes
class Splitting(object):
def __init__(self, attr, infogain, split_type, cond, splits):
self.attr = attr # attribute ID (index in ATTR)
self.infogain = infogain # information gain if splitting is done on this attribute
self.split_type = split_type # one of SplitType
self.cond = cond # splitting condition, i.e., values on outgoing edges
self.splits = splits # list of training records (IDs) for each slitting condition
class Node(object):
def __init__(self, id, type, parent_id, children=None, edge_value=None, val=None, split_type=None, split_cond=None):
self.id = id # ID (same as the index in DT.model list)
self.type = type # one of NodeType
self.parent_id = parent_id # ID of parent node (None if root)
self.children = children # list of IDs of child nodes
self.edge_value = edge_value # the value of the incoming edge (only if not root node)
self.val = val # if root or internal node: the attribute that is compared at that node; if leaf node: the target value
self.split_type = split_type # one of SplitType
self.split_cond = split_cond # splitting condition (median value for binary splits on numerical values; otherwise a list of categorical values (corresponding to child nodes))
self.infogain = infogain
def append_child(self, node_id):
self.children.append(node_id)
# input filename and data format are hard-coded here
INFILE = "data/example.csv"
# attribute labels types (same order as in the file!)
ATTR = [Attribute("Outlook", AttrType.cat), Attribute("Temperature", AttrType.num),
Attribute("Humidity", AttrType.num), Attribute("Windy", AttrType.cat), Attribute("Play?", AttrType.target)]
IDX_TARGET = len(ATTR) - 1 # index of the target attribute (assuming it's the last)
#main class:
class DT(object):
def __init__(self):
self.data = None # training data set (loaded into memory)
self.model = None # decision tree model
self.default_class = None # default target class
def __load_data(self):
with open(INFILE) as csvfile:
self.data = []
csvreader = csv.reader(csvfile, delimiter=',')
for row in csvreader:
rec = []
for i in range(len(ATTR)):
val = row[i].strip()
# convert numerical attributes
if ATTR[i].type == AttrType.num: # Note that this will break for "?" (missing attribute)
val = float(val)
rec.append(val)
self.data.append(rec)
# self.data.append([element.strip() for element in row]) # strip spaces
def __entropy(self, records):
"""
Calculates entropy for a selection of records.
:param records: Data records (given by indices)
"""
# TODO records er en liste med index til hver record
#Entropy of a list of records associated with a node.
dat = {key:self.data[key] for key in records}
count = Counter([x[4] for x in dat.values()]) #target=4="Play?"
return sum([(-freq / len(dat)) * math.log(freq / len(dat), 2) for freq in count.values()])
#return sum([(-freq / len(self.data)) * math.log(freq / len(self.data), 2) for freq in records])
def split_categorical(self, records, attr_name, values_sets):
print("Splitting by {}".format(attr_name))
children = []
for a_i in values_sets: # for each subset of possible values\n",
child = [x for x in records if x[attr_name] in a_i]
children.append(child)
# e.g. if values_sets = [{\"sunny\"}, {\"overcast\", \"rain\"}], and atr_name = \"Outlook\"\n",
# then, in the 2nd iteration, a_i = {\"overcast\", \"rain\"},\n",
# so child = list of records for which the value in \"Outlook\" attr is in {\"overcast\", \"rain\"}\n",
# We also print the entropy for each child\n",
print("Child condition: {}Size = {}Entropy = {}".format(a_i, len(child), entropy(child)))
return children
def split_numeric_binary(self, records, attr_name, splitting_point):
print("Splitting by {}".format(attr_name))
children = [[x for x in records if x[attr_name] <= splitting_point],
[x for x in records if x[attr_name] > splitting_point]]
# We also print the entropy for each child
print("'Less-or-equal-than' child. Size = {}Entropy = {}".format(len(children[0]), entropy(children[0])))
print("'Greater-than' child. Size = {}Entropy = {}".format(len(children[1]), entropy(children[1])))
return children
def infogain(self, parent_records, children_records):
#param parent_records: list of records associated with the parent node.
#param children_records: list of lists, each list contains all the records associated with one child.
entropy_p = entropy(parent_records)
return entropy_p - sum([(len(child_records) / len(parent_records)) * entropy(child_records)
for child_records in children_records])
def __find_best_attr(self, attrs, records):
"""
Finds the attribute with the largest gain.
:param attrs: Set of attributes
:param records: Training set (list of record ids)
:return:
"""
entropy_p = self.__entropy(records) # parent's entropy
splittings = [] # holds the splitting information for each attribute
for a in attrs:
assert ATTR[a].type in AttrType
splits = {} # record IDs corresponding to each split
children = []
# splitting condition depends on the attribute type
if ATTR[a].type == AttrType.target: # skip target attribute
continue
elif ATTR[a].type == AttrType.cat: # categorical attribute
# multi-way split on each possible value
split_mode = SplitType.multi
# each possible attr value corresponds to a split (indexed with categorical labels)
# Note: it's important to consider attr values from the entire training set
split_cond = set([self.data[idx][a] for idx in range(len(self.data))])
# TODO collect training records for each split
# `splits[val]` holds a list of records for a given split,
# where `val` is an element of `split_cond`
#split_categorical og legg til resultat i splits
children = self.split_categorical(records,a,split_cond)
for i, val in enumerate(split_cond):
splits[val]=children[i] #get records for given split
elif ATTR[a].type == AttrType.num: # numerical attribute => binary split on median value
split_mode = SplitType.bin
split_cond = self.__median(a) # (i.e., if less or equal than this value)
# TODO collect training records for each split (in `splits`)
children = self.split_numeric_binary(records, a, split_cond)
for i, val in enumerate(split_cond):
splits[val]=children[i]
# TODO compute gain for attribute a
infogain = self.infogain(records, children)
splitting = Splitting(a, infogain, split_mode, split_cond, splits)
splittings.append(splitting)
# find best splitting
best_splitting = sorted(splittings, key=lambda x: x.infogain, reverse=True)[0]
return best_splitting
def __add_node(self, parent_id, node_type=NodeType.internal, edge_value=None, val=None, split_type=None,
split_cond=None):
"""
Adds a node to the decision tree.
:param parent_id:
:param node_type:
:param edge_value:
:param val:
:param split_type:
:param split_cond:
:return:
"""
node_id = len(self.model) # id of the newly assigned node
if not self.model: # the tree is empty
node_type = NodeType.root
node = Node(node_id, node_type, parent_id, children=[], edge_value=edge_value, val=val, split_type=split_type,
split_cond=split_cond)
self.model.append(node)
# also add it as a child of the parent node
if parent_id is not None:
self.model[parent_id].append_child(node_id)
return node_id
def __id3(self, attrs, records, parent_id=None, value=None):
"""
Function ID3 that returns a decision tree.
:param attrs: Set of attributes
:param records: Training set (list of record ids)
:param parent_id: ID of parent node
:param value: Value corresponding to the parent attribute, i.e., label of the edge on which we arrived to this node
:return:
"""
# empty training set or empty set of attributes => create leaf node with default class
if not records or not attrs:
self.__add_node(parent_id, node_type=NodeType.leaf, edge_value=value, val=self.default_class)
return
# if all records have the same target value => create leaf node with that target value
same = all(self.data[idx][IDX_TARGET] == self.data[records[0]][IDX_TARGET] for idx in records)
if same:
target = self.data[records[0]][IDX_TARGET]
self.__add_node(parent_id, node_type=NodeType.leaf, edge_value=value, val=target)
return
# find the attribute with the largest gain
splitting = self.__find_best_attr(attrs, records)
# add node
node_id = self.__add_node(parent_id, edge_value=value, val=splitting.attr, split_type=splitting.split_type,
split_cond=splitting.cond)
# TODO call tree construction recursively for each split
node = self.model[node_id]
for n_id in node.children:
self.__id3(attrs, records, node_id, node.val)
return self.model
def print_model(self, node_id=0, level=0):
node = self.model[node_id]
indent = " " * level
if node.type == NodeType.leaf:
print(indent + str(node.edge_value) + " [Leaf node] class=" + node.val)
else:
cond = " <= " + str(node.split_cond) if ATTR[node.val].type == AttrType.num else " == ? "
if node.type == NodeType.root:
print("[Root node] '" + ATTR[node.val].label + "'" + cond)
else:
print(indent + str(node.edge_value) + " [Internal node] '" + ATTR[node.val].label + "'" + cond)
# print tree for child notes recursively
for n_id in node.children:
self.print_model(n_id, level + 1)
def build_model(self):
self.__load_data()
#print(list(range(len(self.data))))
#print(list(range(len(self.data))))
self.model = [] # holds the decision tree model, represented as a list of nodes
# Get majority class
# Note: Counter returns a dictionary, most_common(x) returns a list with the x most common elements as
# (key, count) tuples; we need to take the first element of the list and the first element of the tuple
self.default_class = Counter([x[IDX_TARGET] for x in self.data]).most_common(1)[0][0]
self.__id3(set(range(len(ATTR) - 1)), list(range(len(self.data))))
def apply_model(self, record):
node = self.model[0]
while node.type != NodeType.leaf:
# TODO based on the value of the record's attribute that is tested in `node`,
# set `node` to one of its child nodes until a leaf node is reached
for n_id in node.children:
node = self.model[n_id]
return node.val
def main():
dt = DT()
print("Build model:")
dt.build_model()
dt.print_model()
print("\nApply model:")
print(dt.apply_model(['sunny', 85, 85, 'false']))
print(dt.apply_model(['overcast', 75, 85, 'true']))
print(dt.apply_model(['rain', 75, 85, 'false']))
if __name__ == "__main__":
导入csv
输入数学
从统计导入中值,模式
从收款进口柜台
从枚举导入枚举
类属性类型(枚举):
cat=0#分类(定性)属性
num=1#数字(定量)属性
目标=2#目标标签
类节点类型(枚举):
根=0
内部=1
叶=2
类拆分类型(枚举):
bin=0#二进制分割
多路=1#多路分割
类属性(对象):
定义初始化(自我、标签、类型):
属性类型中的断言类型
self.label=标签
self.type=type
self.stat=None#表示数值属性的平均值,表示分类属性的模式
类拆分(对象):
定义初始化(自身、属性、信息增益、拆分类型、条件、拆分):
self.attr=attr#属性ID(attr中的索引)
self.infogain=infogain#如果对该属性进行拆分,则信息增益
self.split_type=split_type#SplitType之一
self.cond=cond#分割条件,即输出边上的值
self.splits=splits#每个分切条件的培训记录(ID)列表
类节点(对象):
定义初始化(self、id、type、parent\u id、children=None、edge\u value=None、val=None、split\u type=None、split\u cond=None):
self.id=id#id(与DT.model列表中的索引相同)
self.type=type#节点类型之一
self.parent_id=parent_id#父节点的id(如果是root,则无)
self.children=children#子节点ID列表
self.edge_value=edge_value#传入边的值(仅当不是根节点时)
self.val=val#如果是根节点或内部节点:在该节点上比较的属性;如果叶节点:目标值
self.split_type=split_type#SplitType之一
self.split_cond=split_cond#spliting条件(数值上二进制分割的中值;否则为分类值列表(对应于子节点))
self.infogain=infogain
def append_子项(自身,节点id):
self.children.append(节点\u id)
#输入文件名和数据格式在此处硬编码
infle=“data/example.csv”
#属性标签类型(与文件中的顺序相同!)
ATTR=[属性(“Outlook”,AttrType.cat),属性(“Temperature”,AttrType.num),
属性(“湿度”,AttrType.num),属性(“风”,AttrType.cat),属性(“播放”,AttrType.target)]
IDX_TARGET=len(ATTR)-1#目标属性的索引(假设它是最后一个)
#主要类别:
DT类(对象):
定义初始化(自):
self.data=None#训练数据集(加载到内存中)
self.model=无#决策树模型
self.default_class=None#默认目标类
定义加载数据(自):
打开(填充)为csvfile时:
self.data=[]
csvreader=csv.reader(csvfile,分隔符=',')
对于csvreader中的行:
rec=[]
对于范围内的i(len(ATTR)):
val=行[i].strip()
#转换数值属性
如果ATTR[i].type==AttrType.num:#请注意,这将中断“”(缺少属性)
val=浮动(val)
记录附加(val)
self.data.append(rec)
#self.data.append([element.strip()表示行中的元素])#条带空间
定义熵(自、记录):
"""
计算选定记录的熵。
:参数记录:数据记录(由索引给出)
"""
#TODO记录在记录之前列出中间索引
#与节点关联的记录列表的熵。
dat={key:self.data[key]用于记录中的key}
count=计数器([x[4]表示数据值()中的x)#target=4=“播放?”
count.values()中freq的返回和([(-freq/len(dat))*math.log(freq/len(dat),2)
#返回和([(-freq/len(self.data))*math.log(freq/len(self.data),2)用于记录中的频率])
def split_分类(自身、记录、属性名称、值集):
打印(“按{}拆分”。格式(属性名称))
儿童=[]
对于值集中的值:#对于可能值的每个子集\n“,
child=[x代表记录中的x,如果a_i中的x[attr_name]
children.append(child)
#例如,如果值\u set=[{“sunny\”}、{“clowst\”、“rain\”}]和atr\u name=\“Outlook\”\n,
#然后,在第二次迭代中,a_i={“阴天”,“雨”}、\n“,
#so child=记录列表,其中\“Outlook\”attr中的值位于{\“clowst\,\“rain\”}\n“,
#我们还打印每个子项的熵\n“,
打印(“子条件:{}大小={}熵={}”。格式(a_i,len(Child),Entropy(Child)))
返回儿童
def分割数值二进制(自身、记录、属性名称、分割点):
打印(“按{}拆分”。格式(属性名称))
children=[[x表示x,如果x[attr\u name]正在拆分点]]
#我们还打印每个孩子的熵
打印(“'Less-or-equal-than'child.Size={}熵={}”。格式(len(children[0]),熵(children[0]))
打印(“'bether-than'child.Size={}Entrop)