Python 如何在gpu集群（多gpu）上训练CNN？_Python_Tensorflow_Cluster Computing_Multi Gpu

Python 如何在gpu集群（多gpu）上训练CNN？

python tensorflow cluster-computing

Python 如何在gpu集群（多gpu）上训练CNN？,python,tensorflow,cluster-computing,multi-gpu,Python,Tensorflow,Cluster Computing,Multi Gpu,我使用下面的代码在单个GPU上使用UCF101数据集训练CNN，但由于数据集的大小，训练需要很长时间 def _get_data_label_from_info(train_info_tensor, name, mode): """ Wrapper for `tf.py_func`, get video clip and label from info list.""" clip_holder, label_holder

我使用下面的代码在单个 GPU 上用 UCF101 数据集训练 CNN，但由于数据集规模较大，训练需要很长时间：

def _get_data_label_from_info(train_info_tensor, name, mode):
    """Run `process_video` inside the graph through `tf.py_func`.

    Args:
        train_info_tensor: Tensor holding one info record of a video.
        name: Dataset name, forwarded unchanged to `process_video`.
        mode: Data mode, forwarded unchanged to `process_video`.

    Returns:
        A `(clip, label)` pair of tensors typed `tf.float32` and `tf.int64`.
    """
    py_inputs = [train_info_tensor, name, mode]
    py_output_types = [tf.float32, tf.int64]
    clip, label = tf.py_func(process_video, py_inputs, py_output_types)
    return clip, label


def process_video(data_info, name, mode, is_training=True):
    """Load one clip and its label for a single video info record.

    A throw-away `Action_Dataset` holding only `data_info` is built and one
    batch of size 1 is drawn from it.

    Args:
        data_info: Info record describing a single video.
        name: Dataset name passed to `Action_Dataset`.
        mode: Data mode passed to `Action_Dataset`.
        is_training: When true, draw `_CLIP_SIZE` frames with the dataset's
            default options; otherwise draw `_EACH_VIDEO_TEST_SIZE + 1`
            frames with shuffling and augmentation switched off.

    Returns:
        `(clip, label)` where `clip` is a float32 numpy array rescaled with
        `2 * (x / 255) - 1` and `label` is whatever `next_batch` returned.
    """
    single_video = Action_Dataset(name, mode, [data_info])
    if not is_training:
        frames, labels = single_video.next_batch(
            1, _EACH_VIDEO_TEST_SIZE+1, shuffle=False, data_augment=False)
    else:
        frames, labels = single_video.next_batch(1, _CLIP_SIZE)
    # Maps 0 -> -1 and 255 -> 1 (presumably the inputs are 8-bit pixel values).
    rescaled = 2*(frames/255) - 1
    return np.array(rescaled, dtype='float32'), labels


def main(dataset='ucf101', mode='rgb', split=1):
    """Fine-tune an I3D network on an action dataset on a single device.

    Builds a TF1 graph: two `tf.data` pipelines (train / test) that share one
    re-initializable iterator, the I3D model followed by a new dense
    classification layer, a cross-entropy + L2 loss and a momentum optimizer
    with a piecewise-constant learning rate. Weights for everything except the
    dense layer are restored from the checkpoint registered under
    `<mode>_imagenet` in `_CHECKPOINT_PATHS`. After every epoch whose training
    accuracy exceeds `_RUN_TEST_THRESH` the test set is evaluated, and the
    best model above `_SAVE_MODEL_THRESH` is saved to the log directory.

    Args:
        dataset: Dataset name; selects the `./data/<dataset>` directory and
            the entry in `_CLASS_NUM`.
        mode: Either 'rgb' or 'flow'.
        split: Number of the test list file (`testlist%02d.txt`).

    Raises:
        AssertionError: If `mode` is neither 'rgb' nor 'flow'.
    """
    assert mode in ['rgb', 'flow'], 'Only RGB data and flow data is supported'
    log_dir = os.path.join(_LOG_ROOT, 'finetune-%s-%s-%d' %
                           (dataset, mode, split))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.basicConfig(level=logging.INFO, filename=os.path.join(log_dir, 'log.txt'),
                        filemode='w', format='%(message)s')

    # ---- Data preload ----
    train_info, test_info = split_data(
        os.path.join('./data', dataset, mode+'.txt'),
        os.path.join('./data', dataset, 'testlist%02d' % split+'.txt'))
    train_data = Action_Dataset(dataset, mode, train_info)
    test_data = Action_Dataset(dataset, mode, test_info)

    num_train_sample = len(train_info)
    # Every element in train_info is shown as below:
    # ['v_ApplyEyeMakeup_g08_c01',
    # '/data4/zhouhao/dataset/ucf101/jpegs_256/v_ApplyEyeMakeup_g08_c01',
    # '121', '0']
    train_info_tensor = tf.constant(train_info)
    test_info_tensor = tf.constant(test_info)

    # ---- Dataset building ----
    # Phase 1: training. Info records are shuffled over the whole training
    # set, mapped to (clip, label) pairs, repeated forever and batched.
    train_info_dataset = tf.data.Dataset.from_tensor_slices(
        (train_info_tensor))
    train_info_dataset = train_info_dataset.shuffle(
        buffer_size=num_train_sample)
    train_dataset = train_info_dataset.map(lambda x: _get_data_label_from_info(
        x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS)
    train_dataset = train_dataset.repeat().batch(_BATCH_SIZE)
    train_dataset = train_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE)

    # Phase 2: testing. Same mapping, no shuffling, one video per batch.
    test_info_dataset = tf.data.Dataset.from_tensor_slices(
        (test_info_tensor))
    test_dataset = test_info_dataset.map(lambda x: _get_data_label_from_info(
        x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS)
    test_dataset = test_dataset.batch(1).repeat()
    test_dataset = test_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE)

    # One iterator serves both pipelines; running an init op switches it.
    iterator = tf.data.Iterator.from_structure(
        train_dataset.output_types, train_dataset.output_shapes)
    train_init_op = iterator.make_initializer(train_dataset)
    test_init_op = iterator.make_initializer(test_dataset)

    clip_holder, label_holder = iterator.get_next()
    # Drop the size-1 axis at position 1 of both tensors.
    clip_holder = tf.squeeze(clip_holder,  [1])
    label_holder = tf.squeeze(label_holder, [1])
    clip_holder.set_shape(
        [None, None, _FRAME_SIZE, _FRAME_SIZE, _CHANNEL[mode]])
    dropout_holder = tf.placeholder(tf.float32)
    is_train_holder = tf.placeholder(tf.bool)

    # ---- Inference module ----
    with tf.variable_scope(_SCOPE[train_data.mode]):
        model = i3d.InceptionI3d(
            400, spatial_squeeze=True, final_endpoint='Logits')
        logits, _ = model(clip_holder, is_training=is_train_holder,
                          dropout_keep_prob=dropout_holder)
        logits_dropout = tf.nn.dropout(logits, dropout_holder)
        # Map the 400 pre-trained classes onto the target dataset's classes.
        fc_out = tf.layers.dense(
            logits_dropout, _CLASS_NUM[dataset], use_bias=True)
        # Per-sample top-1 correctness for the whole batch.
        is_in_top_1_op = tf.nn.in_top_k(fc_out, label_holder, 1)

    # ---- Loss calculation, including L2 norm ----
    # variable_map: variables restored from the pre-trained checkpoint, i.e.
    # everything in the model scope except the new dense layer.
    variable_map = {}
    for variable in tf.global_variables():
        tmp = variable.name.split('/')
        if tmp[0] == _SCOPE[train_data.mode] and 'dense' not in tmp[1]:
            variable_map[variable.name.replace(':0', '')] = variable
        if tmp[-1] == 'w:0' or tmp[-1] == 'kernel:0':
            weight_l2 = tf.nn.l2_loss(variable)
            tf.add_to_collection('weight_l2', weight_l2)
    loss_weight = tf.add_n(tf.get_collection('weight_l2'), 'loss_weight')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=label_holder, logits=fc_out))
    total_loss = loss + _WEIGHT_OF_LOSS_WEIGHT * loss_weight
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_weight', loss_weight)
    tf.summary.scalar('total_loss', total_loss)

    # `saver` restores the pre-trained weights; `saver2` writes checkpoints.
    saver = tf.train.Saver(var_list=variable_map, reshape=True)
    saver2 = tf.train.Saver(max_to_keep=_SAVER_MAX_TO_KEEP)

    # ---- Hyper-parameters ----
    # Number of batches per epoch and total number of training steps.
    per_epoch_step = int(np.ceil(train_data.size/_BATCH_SIZE))
    global_step = _GLOBAL_EPOCH * per_epoch_step
    # Step counter incremented by the optimizer; drives the LR schedule.
    global_index = tf.Variable(0, trainable=False)

    boundaries = [10000, 20000, 30000, 40000, 50000]
    values = [_LEARNING_RATE, 0.0008, 0.0005, 0.0003, 0.0001, 5e-5]
    learning_rate = tf.train.piecewise_constant(
        global_index, boundaries, values)
    tf.summary.scalar('learning_rate', learning_rate)

    # ---- Optimizer ----
    # Ops in UPDATE_OPS (used for batch norm) must run with every train step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.MomentumOptimizer(
            learning_rate, _MOMENTUM).minimize(total_loss,
                                               global_step=global_index)

    sess = tf.Session()
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(log_dir, sess.graph)
    # try/finally so the writer and the session are released on failure too.
    try:
        sess.run(tf.global_variables_initializer())
        sess.run(train_init_op)
        saver.restore(sess, _CHECKPOINT_PATHS[train_data.mode+'_imagenet'])

        print('----Here we start!----')
        print('Output wirtes to ' + log_dir)
        step = 0
        true_count = 0          # correct predictions in the current epoch
        tmp_count = 0           # correct predictions since the last report
        best_test_accuracy = 0  # best test accuracy saved so far
        epoch_completed = 0
        # Run exactly `global_step` steps (the original `<=` ran one extra).
        while step < global_step:
            step += 1
            start_time = time.time()
            _, loss_now, _, is_in_top_1, summary = sess.run(
                [train_op, total_loss, loss_weight, is_in_top_1_op,
                 merged_summary],
                feed_dict={dropout_holder: _DROPOUT, is_train_holder: True})
            duration = time.time() - start_time
            correct = np.sum(is_in_top_1)
            true_count += correct
            tmp_count += correct
            train_writer.add_summary(summary, step)

            # Periodic progress report; console and log file get the same text.
            if step % _OUTPUT_STEP == 0:
                accuracy = tmp_count / (_OUTPUT_STEP * _BATCH_SIZE)
                message = ('step: %-4d, loss: %-.4f, accuracy: %.3f (%.2f sec/batch)' %
                           (step, loss_now, accuracy, float(duration)))
                print(message)
                logging.info(message)
                tmp_count = 0

            if step % per_epoch_step != 0:
                continue

            # ---- End of an epoch ----
            epoch_completed += 1
            accuracy = true_count / (per_epoch_step * _BATCH_SIZE)
            true_count = 0
            # Log the local epoch counter: `train_data` is never iterated in
            # this function, so its own counter does not track these epochs.
            message = 'Epoch%d, train accuracy: %.3f' % (
                epoch_completed, accuracy)
            print(message)
            logging.info(message)

            if accuracy > _RUN_TEST_THRESH:
                # Switch the shared iterator to the test pipeline.
                sess.run(test_init_op)
                test_true_count = 0
                print(test_data.size)
                for _ in range(test_data.size):
                    is_in_top_1 = sess.run(is_in_top_1_op,
                                           feed_dict={dropout_holder: 1,
                                                      is_train_holder: False})
                    test_true_count += np.sum(is_in_top_1)
                test_accuracy = test_true_count / test_data.size
                # to ensure every test procedure has the same test size
                test_data.index_in_epoch = 0
                message = 'Epoch%d, test accuracy: %.3f' % (
                    epoch_completed, test_accuracy)
                print(message)
                logging.info(message)
                # Keep only models that beat both the threshold and the best
                # test accuracy seen so far.
                if (test_accuracy > _SAVE_MODEL_THRESH
                        and test_accuracy > best_test_accuracy):
                    best_test_accuracy = test_accuracy
                    saver2.save(sess, os.path.join(
                        log_dir,
                        test_data.name+'_'+train_data.mode +
                        '_{:.3f}_model'.format(test_accuracy)), step)
                # Switch the shared iterator back to the training pipeline.
                sess.run(train_init_op)
    finally:
        train_writer.close()
        sess.close()

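def _get_data_label_from_info(train_info_tensor, name, mode):
    """ Wrapper for `tf.py_func`, get video clip and label from info list."""
    clip_holder, label_holder = tf.py_func(
        process_video, [train_info_tensor, name, mode], [tf.float32, tf.int64])
    return clip_holder, label_holder


def process_video(data_info, name, mode, is_training=True):
    """ Get video clip and label from data info list."""
    data = Action_Dataset(name, mode, [data_info])
    if is_training:
        clip_seq, label_seq = data.next_batch(1, _CLIP_SIZE)
    else:
        clip_seq, label_seq = data.next_batch(
            1, _EACH_VIDEO_TEST_SIZE+1, shuffle=False, data_augment=False)
    clip_seq = 2*(clip_seq/255) - 1
    clip_seq = np.array(clip_seq, dtype='float32')
    return clip_seq, label_seq


def main(dataset='ucf101', mode='rgb', split=1):
    assert mode in ['rgb', 'flow'], 'Only RGB data and flow data is supported'
    log_dir = os.path.join(_LOG_ROOT, 'finetune-%s-%s-%d' %
                           (dataset, mode, split))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.basicConfig(level=logging.INFO, filename=os.path.join(log_dir, 'log.txt'),
                        filemode='w', format='%(message)s')
    ##  Data Preload  ###
    train_info, test_info = split_data(
        os.path.join('./data', dataset, mode+'.txt'),
        os.path.join('./data', dataset, 'testlist%02d' % split+'.txt'))
#        os.path.join('/data1/yunfeng/i3d_test/data', dataset, mode+'.txt'),
#        os.path.join('/data1/yunfeng/i3d_test/data', dataset, 'testlist%02d' % split+'.txt'))
    train_data = Action_Dataset(dataset, mode, train_info)
    test_data = Action_Dataset(dataset, mode, test_info)
    num_train_sample = len(train_info)
    # Every element in train_info is shown as below:
    # ['v_ApplyEyeMakeup_g08_c01',
    # '/data4/zhouhao/dataset/ucf101/jpegs_256/v_ApplyEyeMakeup_g08_c01',
    # '121', '0']
    train_info_tensor = tf.constant(train_info)
    test_info_tensor = tf.constant(test_info)
    # Dataset building
    # Phase 1 Trainning
    # one element in this dataset is (train_info list)
    train_info_dataset = tf.data.Dataset.from_tensor_slices(
        (train_info_tensor))
    # one element in this dataset is (single image_postprocess, single label)
    # one element in this dataset is (batch image_postprocess, batch label)
    train_info_dataset = train_info_dataset.shuffle(
        buffer_size=num_train_sample)
    train_dataset = train_info_dataset.map(lambda x: _get_data_label_from_info(
        x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS)
    train_dataset = train_dataset.repeat().batch(_BATCH_SIZE)
    train_dataset = train_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE)
    # Phase 2 Testing
    # one element in this dataset is (train_info list)
    test_info_dataset = tf.data.Dataset.from_tensor_slices(
        (test_info_tensor))
    # one element in this dataset is (single image_postprocess, single label)
    test_dataset = test_info_dataset.map(lambda x: _get_data_label_from_info(
        x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS)
    # one element in this dataset is (batch image_postprocess, batch label)
    test_dataset = test_dataset.batch(1).repeat()
    test_dataset = test_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE)
    # iterator = dataset.make_one_shot_iterator()
    # clip_holder, label_holder = iterator.get_next()
    iterator = tf.data.Iterator.from_structure(
        train_dataset.output_types, train_dataset.output_shapes)
    train_init_op = iterator.make_initializer(train_dataset)
    test_init_op = iterator.make_initializer(test_dataset)
    clip_holder, label_holder = iterator.get_next()
    clip_holder = tf.squeeze(clip_holder,  [1])
    label_holder = tf.squeeze(label_holder, [1])
    clip_holder.set_shape(
        [None, None, _FRAME_SIZE, _FRAME_SIZE, _CHANNEL[mode]])
    dropout_holder = tf.placeholder(tf.float32)
    is_train_holder = tf.placeholder(tf.bool)
    # inference module
    # Inference Module
    with tf.variable_scope(_SCOPE[train_data.mode]):
        # insert i3d model
        model = i3d.InceptionI3d(
            400, spatial_squeeze=True, final_endpoint='Logits')
        # the line below outputs the final results with logits
        # __call__ uses _template, and _template uses _build when defined
        logits, _ = model(clip_holder, is_training=is_train_holder,
                          dropout_keep_prob=dropout_holder)
        logits_dropout = tf.nn.dropout(logits, dropout_holder)
        # To change 400 classes to the ucf101 or hdmb classes
        fc_out = tf.layers.dense(
            logits_dropout, _CLASS_NUM[dataset], use_bias=True)
        # compute the top-k results for the whole batch size
        is_in_top_1_op = tf.nn.in_top_k(fc_out, label_holder, 1)
    # Loss calculation, including L2-norm
    variable_map = {}
    train_var = []
    for variable in tf.global_variables():
        tmp = variable.name.split('/')
        if tmp[0] == _SCOPE[train_data.mode] and 'dense' not in tmp[1]:
            variable_map[variable.name.replace(':0', '')] = variable
        if tmp[-1] == 'w:0' or tmp[-1] == 'kernel:0':
            weight_l2 = tf.nn.l2_loss(variable)
            tf.add_to_collection('weight_l2', weight_l2)
    loss_weight = tf.add_n(tf.get_collection('weight_l2'), 'loss_weight')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=label_holder, logits=fc_out))
    total_loss = loss + _WEIGHT_OF_LOSS_WEIGHT * loss_weight
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_weight', loss_weight)
    tf.summary.scalar('total_loss', total_loss)
    # Import Pre-trainned model
    saver = tf.train.Saver(var_list=variable_map, reshape=True)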
# Data-parallel training on several GPUs of one machine with
# tf.distribute.MirroredStrategy. List every device that should be used.
mirrored_strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"])

# Build and compile the model inside the strategy's scope. The scope must be
# entered on the strategy object created above (the original snippet used an
# undefined name `distribution`).
with mirrored_strategy.scope():
    mirrored_model = tf.keras.Sequential([...])
    mirrored_model.compile([...])

batch_size = 100  # must be divisible by the number of replicas
# Pass the batch size explicitly; otherwise the value above is never used.
history = mirrored_model.fit(X_train, y_train, epochs=10,
                             batch_size=batch_size)

# A previously saved model is loaded inside the same scope.
with mirrored_strategy.scope():
    mirrored_model = tf.keras.models.load_model("my_mnist_model.h5")