Machine learning RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn (PyTorch training)

I am using a model from Hugging Face (Arabic ALBERT) to perform a text classification task. Here is a snapshot of my code:

import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'kuisailab/albert-base-arabic'
num_labels = 2
task_name = 'classification'

model_config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)  # needed for the visualizations
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load model to the defined device.
model.to(device)
print('Model loaded to `%s`' % device)

# Freeze the pretrained backbone.
for param in model.parameters():
    param.requires_grad = False

# Add three new layers at the end of the network.
model.classifier = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, 2),
    nn.Softmax(dim=1)
)

model = model.to(device)


from transformers import BertForSequenceClassification, AdamW, BertConfig

criterion = nn.MSELoss().to(device)
# optimizer = optim.SGD(model.classifier.parameters(), lr=0.01)  # acc=0.70
optimizer = AdamW(model.parameters(),
                  lr=0.001,  # lr = 2e-5 gave acc=0.75
                  eps=1e-8)
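
As a side note, since every backbone parameter is frozen above and the `classifier` head is added only afterwards, it can be worth confirming which tensors will actually receive gradients. A minimal sanity check (illustrative, not part of the original code):

# Count parameter tensors that will receive gradients; with the setup above,
# only the three Linear layers inside `model.classifier` should be trainable.
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
print(f'trainable tensors: {len(trainable)}, frozen tensors: {len(frozen)}')
print(trainable)  # expect entries like classifier.0.weight, classifier.0.bias, ...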
The following function is used for text preprocessing and tokenization:

def preprocess_text(text):
    parts = []

    text_len = len(text.split(' '))
    delta = 300
    max_parts = 5
    nb_cuts = int(text_len / delta)
    nb_cuts = min(nb_cuts, max_parts)

    for i in range(nb_cuts + 1):
        text_part = ' '.join(text.split(' ')[i * delta: (i + 1) * delta])
        parts.append(tokenizer.encode(text_part, return_tensors="pt", max_length=500).to(device))

    return parts
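
For clarity, here is what the helper returns on a toy input (a hypothetical smoke test, not from the original post): the text is split into chunks of roughly 300 words (at most 6 parts), and each chunk is encoded to a `(1, seq_len)` tensor of token ids already moved to `device`.

# Hypothetical smoke test: 700 repeated words -> int(700 / 300) + 1 = 3 parts.
sample = ' '.join(['word'] * 700)
parts = preprocess_text(sample)
print(len(parts))          # 3
print(parts[0].shape)      # torch.Size([1, seq_len])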
Here is the training-loop code that causes the error:

import torch.nn.functional as F

print_every = 300

total_loss = 0
all_losses = []

CUDA_LAUNCH_BLOCKING = 1  # note: as a plain Python variable this has no effect; it would need to be set as an environment variable before CUDA initializes

model.train()

for idx, row in train_data.iterrows():
    text_parts = preprocess_text(str(row['sentence']))
    label = torch.tensor([row['label']]).long().to(device)

    optimizer.zero_grad()

    overall_output = torch.zeros((1, 2)).float().to(device)
    for part in text_parts:
        if len(part) > 0:
            try:
                input = part.reshape(-1)[:512].reshape(1, -1)
                # print(input.shape)
                overall_output += model(input, labels=label)[1].float().to(device)
            except Exception as e:
                print(str(e))

    overall_output /= len(text_parts)
    overall_output = F.softmax(overall_output[0], dim=-1)

    # Turn the integer label into a one-hot target for MSELoss.
    if label == 0:
        label = torch.tensor([1.0, 0.0]).float().to(device)
    elif label == 1:
        label = torch.tensor([0.0, 1.0]).float().to(device)

    # print(overall_output, label)

    loss = criterion(overall_output, label)
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

    if idx % print_every == 0 and idx > 0:
        average_loss = total_loss / print_every
        print("{}/{}. Average loss: {}".format(idx, len(train_data), average_loss))
        all_losses.append(average_loss)
        total_loss = 0
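
For context, a quick way to see where the autograd graph gets cut is to inspect `grad_fn` on the accumulated output right before computing the loss. A minimal diagnostic sketch (illustrative; if every parameter that produced the tensor has `requires_grad=False`, both checks below come back negative and `loss.backward()` raises exactly this error):

# Diagnostic sketch: a tensor that is not connected to any trainable
# parameter has requires_grad == False and grad_fn is None, so calling
# backward() on a loss built from it raises
# "element 0 of tensors does not require grad and does not have a grad_fn".
print(overall_output.requires_grad)  # False when the whole backbone is frozen
print(overall_output.grad_fn)        # None in that case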

Any suggestions for resolving the error "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"?

Please post the full error traceback.