Python: Out of memory when performing gradient accumulation in TensorFlow

I'm trying to implement gradient accumulation for a Twitter sentiment-analysis model using HuggingFace's BERT model. However, when I implement gradient accumulation with a batch size of 64, I get the dreaded "OOM" error. Strangely, when I train the same model with a batch size of 64 without gradient accumulation, training completes fine. Does anyone know why this happens and/or whether my code is wrong?


import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer

batch_size = 32
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
vocabulary = tokenizer.get_vocab()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# data preprocessing
tweets_pos = pd.read_csv('C:/1_Tweets.csv', sep=',', names=['Tweet', 'Sentiment'])
tweets_neg = pd.read_csv('C:/0_Tweets.csv', sep=',', names=['Tweet', 'Sentiment'])
data = pd.concat([tweets_pos, tweets_neg], axis=0)
data = data.sample(frac=1)  # shuffle the combined positive/negative rows
all_tweets = data['Tweet'].to_list()
all_sentiment = data['Sentiment'].to_list()
training_tweets = all_tweets[0:512]
training_labels = all_sentiment[0:512]

# create dataset
def create_dataset(tweets, labels):
  
  inputs_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []
  
  for i in range(len(tweets)):
    # tokenize each tweet, padding to a fixed length of 512 tokens
    encoded = tokenizer.encode_plus(tweets[i], max_length=512, pad_to_max_length=True, return_attention_mask=True, add_special_tokens=True)
    inputs_ids_list.append(encoded['input_ids'])
    token_type_ids_list.append(encoded['token_type_ids'])
    attention_mask_list.append(encoded['attention_mask'])
    label_list.append([labels[i]])

  ids_and_mask = {'input_ids':inputs_ids_list, 'token_type_ids':token_type_ids_list,'attention_mask':attention_mask_list}
  
  return tf.data.Dataset.from_tensor_slices((ids_and_mask, label_list))

# create dataset of batch_size = 32
train_dataset = create_dataset(training_tweets, training_labels).batch(batch_size)
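
The training loop below references a `model` that is never defined in the post. As a minimal stand-in so the sketch runs end to end, one plausible choice (an assumption, not part of the original question) is a stock HuggingFace classifier:

# Assumed model definition (not in the original post): a two-label TF BERT
# classifier matching the sparse-categorical loss above. Note that depending
# on the transformers version, calling the model may return an output object
# whose .logits field holds the predictions.
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)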

# Accumulate Gradients
num_epochs = 1
for i in range(num_epochs):
  print(f'Epoch: {i + 1}')
  total_loss = 0

  # get trainable variables
  train_vars = model.trainable_variables
  accum_gradient = [tf.zeros_like(this_var) for this_var in train_vars]

  for (batch, (tweets, labels)) in enumerate(train_dataset):
      labels = tf.dtypes.cast(labels, tf.float32)
      with tf.GradientTape() as tape:
          prediction = model(tweets, training=True)
          prediction = tf.dtypes.cast(prediction, tf.float32)
          loss_value = loss(y_true=labels, y_pred=prediction)
      total_loss += loss_value

      # get gradients of this tape
      gradients = tape.gradient(loss_value, train_vars)
      # Accumulate the gradients
      accum_gradient = [(accum_grad + grad) for accum_grad, grad in zip(accum_gradient, gradients)]

  # average the accumulated gradients over the number of batches
  # and apply a single optimization step for the epoch
  num_batches = batch + 1
  accum_gradient = [this_grad / num_batches for this_grad in accum_gradient]
  optimizer.apply_gradients(zip(accum_gradient, train_vars))

  epoch_loss = total_loss / num_batches
  print(f'Epoch loss: {epoch_loss}')
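
For comparison, here is a common step-wise variant of the same technique, a sketch under the assumptions above rather than the poster's code: it accumulates into non-trainable tf.Variable buffers and applies the averaged update every accum_steps mini-batches instead of once per epoch, so the update cadence matches an effective batch of 64.

accum_steps = 2  # with batch_size = 32 this gives an effective batch of 64

train_vars = model.trainable_variables
# non-trainable buffers that hold the running gradient sums
accum_buffers = [tf.Variable(tf.zeros_like(v), trainable=False) for v in train_vars]

for step, (tweets, labels) in enumerate(train_dataset):
    with tf.GradientTape() as tape:
        prediction = model(tweets, training=True)
        loss_value = loss(y_true=labels, y_pred=prediction)
    gradients = tape.gradient(loss_value, train_vars)
    for buf, grad in zip(accum_buffers, gradients):
        # convert_to_tensor densifies IndexedSlices from the embedding layer
        buf.assign_add(tf.convert_to_tensor(grad))
    if (step + 1) % accum_steps == 0:
        optimizer.apply_gradients(
            [(buf / accum_steps, var) for buf, var in zip(accum_buffers, train_vars)])
        for buf in accum_buffers:
            buf.assign(tf.zeros_like(buf))

Whether this avoids the OOM depends on where the memory actually goes; padding every tweet to max_length=512 makes the per-batch activations large, so the accumulation buffers are not necessarily the dominant cost.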