Tensorflow 在BERT-finetune中使用梯度累积的方法_Tensorflow_Optimization_Gradient_Accumulate

Tensorflow 在BERT-finetune中使用梯度累积的方法

tensorflow optimization

Tensorflow 在BERT-finetune中使用梯度累积的方法,tensorflow,optimization,gradient,accumulate,Tensorflow,Optimization,Gradient,Accumulate,我在做一个伯特·费恩特的曲子，我有OOM问题。我听说处理这个问题的一个好方法是使用“梯度累积”。下面是我的优化.py（包括渐变累积）来自未来导入绝对导入来自未来进口部来自未来导入打印功能进口稀土导入tensorflow作为tf 从tensorflow.python.training导入优化器从tensorflow.python.framework导入ops def create_优化器（丢失、初始lr、训练步骤数、预热步骤数、使用步骤数）： “”“创建优化器培训op.”“” 全局\u

我在做 BERT 微调（finetune）时遇到了 OOM（内存不足）问题。我听说解决这个问题的一个好方法是使用“梯度累积”（gradient accumulation）。下面是我的 optimization.py（已加入梯度累积）：

来自未来导入绝对导入
来自未来进口部
来自未来导入打印功能
进口稀土
导入tensorflow作为tf
从tensorflow.python.training导入优化器
从tensorflow.python.framework导入ops
def create_优化器（丢失、初始lr、训练步骤数、预热步骤数、使用步骤数）：
“”“创建优化器培训op.”“”
全局\u步骤=tf.train.get\u或\u create\u全局\u步骤（）
学习速率=tf.constant（value=init\u lr，shape=[]，dtype=tf.float32）
#实现学习速率的线性衰减。
学习率=tf.train.polyman\u衰减(
学习率，
全球行动计划，
列车步数，
结束学习率=0.0，
功率=1.0，
周期=假）
#实现线性预热。即，如果全局步骤<数值预热步骤
#学习率将为“全局步数/num步数预热步数*init\u lr”。
如果num_预热步骤：
全局步长int=tf.cast（全局步长，tf.int32）
预热步数int=tf.constant（num\u预热步数，dtype=tf.int32）
全局步骤浮点=tf.cast（全局步骤整数，tf.float32）
预热步骤浮点=tf.cast（预热步骤浮点，tf.float32）
预热百分比完成=全局步骤浮动/预热步骤浮动
预热学习率=初始学习率*预热完成百分比
is_warmup=tf.cast（全局_步骤_int
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import tensorflow as tf



from tensorflow.python.training import optimizer
from tensorflow.python.framework import ops





def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     gradient_accumulation_steps=1):
  """Creates an optimizer training op, optionally with gradient accumulation.

  The learning rate decays linearly from `init_lr` to 0 over `num_train_steps`
  and, when `num_warmup_steps` is truthy, is ramped up linearly during the
  first `num_warmup_steps` steps.

  Args:
    loss: Scalar tensor to minimize.
    init_lr: Peak learning rate (Python float).
    num_train_steps: Number of steps over which the learning rate decays to 0.
    num_warmup_steps: Number of linear warmup steps; falsy disables warmup.
    use_tpu: If true, wrap the optimizer in `CrossShardOptimizer`.
    gradient_accumulation_steps: Number of consecutive calls of the returned
      op whose gradients are averaged into one parameter update. The default
      of 1 updates the parameters on every call (no accumulation).

  Returns:
    An op that runs one optimizer step and increments the global step.

  Raises:
    ValueError: If `gradient_accumulation_steps` is smaller than 1.

  Note:
    The global step is incremented on every run of the returned op, i.e. once
    per micro-batch, so `num_train_steps` and `num_warmup_steps` are measured
    in micro-batches. Gradient clipping is likewise applied to each
    micro-batch gradient before it is accumulated.
  """
  if gradient_accumulation_steps < 1:
    raise ValueError(
        "gradient_accumulation_steps must be >= 1, got %r" %
        (gradient_accumulation_steps,))

  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    # is_warmup is 1.0 during warmup and 0.0 afterwards, so the expression
    # below selects exactly one of the two schedules.
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  # The local is named `opt` so it does not shadow the `optimizer` module
  # imported at the top of the file.
  opt = MultistepAdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,  # 0.98 ONLY USED FOR PRETRAIN. MUST BE 0.999 AT FINE-TUNING.
      n=gradient_accumulation_steps,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    opt = tf.contrib.tpu.CrossShardOptimizer(opt)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  # Pass a list rather than a bare `zip`: on Python 3 `zip` is a one-shot
  # iterator and an optimizer that walks the pairs twice would see nothing
  # on the second pass.
  train_op = opt.apply_gradients(
      list(zip(grads, tvars)), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `MultistepAdamWeightDecayOptimizer` doesn't do this. But if you
  # use a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op


class MultistepAdamWeightDecayOptimizer(optimizer.Optimizer):
  """Adam with weight decay that accumulates gradients over `n` steps.

  Every call of the op returned by `apply_gradients` either adds the incoming
  gradients to a per-variable "grad_acc" slot, or (once every `n` calls)
  averages the accumulated gradients with the current one, applies a single
  Adam update and resets the accumulators. With `n == 1` every call applies
  an update.

  The weight-decay term is added to the update after the m/v moments have
  been computed, so it does not influence the moment estimates. No bias
  correction is applied to m/v.
  """

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               n=1,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="MultistepAdamWeightDecayOptimizer"):
    """Constructs a MultistepAdamWeightDecayOptimizer.

    Args:
      learning_rate: Float or scalar tensor, the step size.
      weight_decay_rate: Coefficient of the weight-decay term; falsy disables
        weight decay.
      beta_1: Decay rate of the first-moment estimate.
      beta_2: Decay rate of the second-moment estimate.
      n: Number of steps whose gradients are averaged into one update.
      epsilon: Small constant added to the denominator of the update.
      exclude_from_weight_decay: Optional list of regex patterns; variables
        whose name matches any of them are not decayed.
      name: Name of the optimizer.
    """
    super(MultistepAdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self._n = n

    self.exclude_from_weight_decay = exclude_from_weight_decay

    # Tensor version of `n`; created in `_prepare`.
    self._n_t = None

  def _prepare(self):
    """Converts the accumulation count `n` to a tensor."""
    super(MultistepAdamWeightDecayOptimizer, self)._prepare()
    self._n_t = tf.convert_to_tensor(self._n, name="n")

  def _create_slots(self, var_list):
    """Creates the step counter and one "grad_acc" slot per variable."""
    super(MultistepAdamWeightDecayOptimizer, self)._create_slots(var_list)
    first_var = min(var_list, key=lambda x: x.name)
    # The counter starts at 1 when accumulating so that the update (which
    # fires when the counter is 0) happens after n - 1 accumulation steps;
    # with n == 1 it starts at 0 and stays there, updating on every step.
    self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
                                   name="iter",
                                   colocate_with=first_var)

    for v in var_list:
      self._zeros_slot(v, "grad_acc", self._name)

  def _get_iter_variable(self):
    """Returns the non-slot variable that counts accumulation steps."""
    if tf.contrib.eager.in_eager_mode():
      graph = None
    else:
      graph = tf.get_default_graph()
    return self._get_non_slot_variable("iter", graph=graph)

  def _adam_update_op(self, grad_acc, grad, param, m, v, use_weight_decay):
    """Builds the op that applies one Adam step and clears `grad_acc`."""
    # Average of the n - 1 accumulated gradients plus the current one.
    total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)

    next_m = (
        tf.multiply(self.beta_1, m) +
        tf.multiply(1.0 - self.beta_1, total_grad))
    next_v = (
        tf.multiply(self.beta_2, v) +
        tf.multiply(1.0 - self.beta_2, tf.square(total_grad)))

    update = next_m / (tf.sqrt(next_v) + self.epsilon)
    if use_weight_decay:
      # Added after m/v are computed so the decay does not enter the moments.
      update += self.weight_decay_rate * param
    next_param = param - self.learning_rate * update

    adam_op = tf.group(param.assign(next_param), m.assign(next_m),
                       v.assign(next_v))
    # Reset the accumulator only after the update has consumed it.
    with tf.control_dependencies([adam_op]):
      grad_acc_to_zero_op = grad_acc.assign(
          tf.zeros_like(grad_acc), use_locking=self._use_locking)
    return tf.group(adam_op, grad_acc_to_zero_op)

  def _accumulate_op(self, grad_acc, grad):
    """Builds the op that adds `grad` to the accumulator `grad_acc`."""
    assign_op = tf.assign_add(grad_acc, grad, use_locking=self._use_locking)
    return tf.group(assign_op)

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulates gradients and applies an Adam update every `n` calls.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs; pairs whose
        gradient or variable is None are skipped.
      global_step: Ignored; this optimizer does not touch the global step,
        the caller is responsible for incrementing it.
      name: Optional name for the returned op.

    Returns:
      An op that performs the accumulate-or-update step for every variable
      and advances the internal step counter.

    Raises:
      ValueError: If no pair has both a gradient and a variable.
    """
    # Materialize the pairs first: callers commonly pass `zip(grads, tvars)`,
    # which on Python 3 is a one-shot iterator and would be empty by the time
    # the update loop below runs.
    grads_and_vars = [
        (g, v) for g, v in grads_and_vars if g is not None and v is not None]
    if not grads_and_vars:
      raise ValueError("No gradients provided for any variable.")
    var_list = [v for _, v in grads_and_vars]

    with ops.init_scope():
      self._create_slots(var_list)
    self._prepare()

    # The counter is 0 exactly on the step where the update must be applied.
    is_update_step = tf.equal(self._get_iter_variable(), 0)

    update_ops = []
    for grad, param in grads_and_vars:
      grad_acc = self.get_slot(param, "grad_acc")
      param_name = self._get_variable_name(param.name)

      m = tf.get_variable(
          name=param_name + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=param_name + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      use_weight_decay = self._do_use_weight_decay(param_name)

      # `tf.cond` invokes both callables right here while building the graph,
      # so capturing the loop variables in the lambdas is safe.
      update_op = tf.cond(
          is_update_step,
          lambda: self._adam_update_op(
              grad_acc, grad, param, m, v, use_weight_decay),
          lambda: self._accumulate_op(grad_acc, grad))
      update_ops.append(update_op)

    return self._finish(update_ops, name_scope=name)

  def _finish(self, update_ops, name_scope):
    """Advances the step counter modulo `n` after all updates have run."""
    iter_ = self._get_iter_variable()
    with tf.control_dependencies(update_ops):
      with tf.colocate_with(iter_):
        update_iter = iter_.assign(tf.mod(iter_ + 1, self._n_t),
                                   use_locking=self._use_locking)

    return tf.group(
        *update_ops + [update_iter], name=name_scope)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    patterns = self.exclude_from_weight_decay or ()
    return not any(
        re.search(pattern, param_name) is not None for pattern in patterns)

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name (strips a `:<n>` suffix)."""
    m = re.match(r"^(.*):\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name