Machine learning 将野生权重应用于自定义损失函数不会';不要改变行为
我正在用pytorch geometric中的图形神经网络回归粒子特性。我想写一个自定义的损失函数,这样我可以使模型特别注意某些粒子。(上下文:我希望模型特别关注低能粒子) 在尝试合理的重量后,我看不到行为上的变化。我现在尝试应用权重w=[500,0.1],其中500应用于我希望模型敏感的情况,0.1应用于我希望模型不太关心的情况。我最初的想法是,这些重量太大了,应该会引起行为上的改变——但事实并非如此!结果与设置w=1相同 我花了数周时间试图找出问题所在——模型应该有足够的500个案例来关注它们 我可以肯定的是:Machine learning 将野生权重应用于自定义损失函数不会';不要改变行为,machine-learning,pytorch,loss-function,Machine Learning,Pytorch,Loss Function,我正在用pytorch geometric中的图形神经网络回归粒子特性。我想写一个自定义的损失函数,这样我可以使模型特别注意某些粒子。(上下文:我希望模型特别关注低能粒子) 在尝试合理的重量后,我看不到行为上的变化。我现在尝试应用权重w=[500,0.1],其中500应用于我希望模型敏感的情况,0.1应用于我希望模型不太关心的情况。我最初的想法是,这些重量太大了,应该会引起行为上的改变——但事实并非如此!结果与设置w=1相同 我花了数周时间试图找出问题所在——模型应该有足够的500个案例来关注它
def LossFunc(output,target,weight):
loss = torch.sum(torch.abs(weight*(output-target)))
return loss
def KNNAmp(k,x,batch):
pos = x[:,0:3]
edge_index = knn_graph(x=pos,k=k,batch=batch).to(device)
return x,edge_index
class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
l1, l2, l3, l4, l5,l6,l7 = 14,16*2,32*2,42*2,32*2,16*2,1
self.nn_conv1 = torch.nn.Sequential(torch.nn.Linear(l1*2,l2),torch.nn.ReLU(),torch.nn.Linear(l2,l3),torch.nn.ReLU()).to(device)
self.conv_max = EdgeConv(self.nn_conv1,aggr = 'max')
self.conv_mean = EdgeConv(self.nn_conv1,aggr = 'mean')
self.conv_add = EdgeConv(self.nn_conv1,aggr = 'add')
self.nn1 = torch.nn.Linear(3*l3,l4)
self.nn2 = torch.nn.Linear(l4,l5)
self.nn3 = torch.nn.Linear(4*l5,l6)
self.nn4 = torch.nn.Linear(l6,l7)
self.relu = torch.nn.LeakyReLU()
def forward(self, data):
x, edge_index, batch = data.x, data.edge_index, data.batch
x,edge_index = KNNAmp(4, x, batch)
x_max = self.conv_max(x,edge_index)
x_mean = self.conv_mean(x,edge_index)
x_add = self.conv_add(x,edge_index)
x = self.relu(torch.cat((x_max,x_mean,x_add),dim=1))
x = self.nn1(x)
x = self.relu(x)
x = self.nn2(x)
a,_ = scatter_max(x, batch, dim = 0)
b,_ = scatter_min(x, batch, dim = 0)
c = scatter_sum(x,batch,dim = 0)
d = scatter_mean(x,batch,dim= 0)
x = torch.cat((a,b,c,d),dim = 1)
x = self.relu(x)
x = self.nn3(x)
x = self.relu(x)
x = self.nn4(x)
return x
def Train(model_to_train,graphs_train,device,lr,lr_list,n_workers,batch_size,loss_decimals,graphs_valid,mini_batches_valid):
count = torch.tensor([0]).to(device)
model = model_to_train.to(device)
loss_func = LossFunc
count = torch.tensor(0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr,eps = 1e-3)
print('TRAINING BEGUN. FIRST EPOCH STARTING..')
for epoch in range(0,n_epochs):
loss_acc = torch.tensor([0],dtype = float).to(device)
for k in range(0,mini_batches):
data_train = GrabBatch(q)
with torch.enable_grad():
data_train.y = data_train.y[:,0].unsqueeze(1)
model.train()
w = data_train.lw
optimizer.zero_grad()
out = model(data_train)
loss = loss_func(out, data_train.y.float(),w)
loss.backward()
optimizer.step()
optimizer.param_groups[0]['lr'] = lr_list[count].item()
count +=1
loss_acc +=loss
with torch.no_grad():
val_loss = Validate(model,graphs_valid,graphs_train,batch_size,o,
mini_batches_valid,mini_batches,manager,epoch,n_epochs,save = False)
if epoch == 0:
deltatime = (time.time() - start)/60
print('EPOCH: %s / %s || %s / %s min || LR: %s || Loss: %s || Val Loss: %s' %(epoch,n_epochs,(time.time() - start)/60,n_epochs*deltatime,optimizer.param_groups[0]['lr'],round(loss_acc.item()/mini_batches,loss_decimals),round(val_loss.item()/mini_batches_valid,loss_decimals)))
return model
def Validate(trained_model,graphs_valid,graphs_train,batch_size,
o,q,mini_batches_valid,val_slaves,n_workers,mini_batches,manager,train_epoch,n_epochs,save):
print('in validation')
model = trained_model
loss_func = LossFunc()
model.eval()
count = 0
print('Validating..')
acc_loss = torch.tensor([0],dtype = float).to(device)
with torch.no_grad():
for i in range(0, mini_batches_valid):
count = count + 1
data_pred = GrabBatch(q)
data_pred.y = data_pred.y[:,0].unsqueeze(1)
pred = model(data_pred)
w = data_pred.lw
loss = loss_func(pred,data_pred.y,w)
acc_loss += loss
val_loss = acc_loss
return val_loss
数据“data_pred”和“data_train”是一个几何“数据对象”,我通过一个定制的并行加载器加载到内存中,我已经构建了这个加载器,上面的代码没有提到它,因为我认为它是不相关的。它们各自包含1024个粒子(例如批次大小=1024)。他们将设备设置为GPU
你能告诉我我在什么地方破坏了反向传播吗?你能看到有什么明显的错误吗
非常感谢