Tensorflow 当尝试使用估计器进行分布式培训时,采用独立的客户机模式?
我正在远程访问我的大学2台机器。为此,我使用了TensorFlow多工作者镜像策略。我试图在两台机器上部署一个deep模型。为此,代码行为:Tensorflow 当尝试使用估计器进行分布式培训时,采用独立的客户机模式?,tensorflow,deep-learning,conv-neural-network,distributed-computing,tensorflow-estimator,Tensorflow,Deep Learning,Conv Neural Network,Distributed Computing,Tensorflow Estimator,我正在远程访问我的大学2台机器。为此,我使用了TensorFlow多工作者镜像策略。我试图在两台机器上部署一个deep模型。为此,代码行为: os.environ['TF_CONFIG'] = json.dumps({ 'cluster': { 'worker': ["gpu11.cse.cuhk.edu.hk:8000", "gpu12.cse.cuhk.edu.hk:8000"] }, 'task': {'typ
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': ["gpu11.cse.cuhk.edu.hk:8000", "gpu12.cse.cuhk.edu.hk:8000"]
},
'task': {'type': 'worker', 'index': 0}
})
我不确定这个工人地址是syntex,可以吗
**完整代码:**
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
import os
import json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.INFO)
from tensorflow.keras.datasets import mnist
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': ["gpu11.cse.cuhk.edu.hk:8000", "gpu12.cse.cuhk.edu.hk:8000"]
},
'task': {'type': 'worker', 'index': 0}
})
def cnn_model_fn(features, labels, mode):
"""Model function for CNN."""
# Input Layer
# Reshape X to 4-D tensor: [batch_size, width, height, channels]
# MNIST images are 28x28 pixels, and have one color channel
input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
input_layer = tf.cast(input_layer, tf.float32)
labels = tf.cast(labels, tf.int32)
# Convolutional Layer #1
# Computes 32 features using a 5x5 filter with ReLU activation.
# Padding is added to preserve width and height.
# Input Tensor Shape: [batch_size, 28, 28, 1]
# Output Tensor Shape: [batch_size, 28, 28, 32]
conv1 = tf.layers.conv2d(
inputs=input_layer,
filters=32,
kernel_size=[5, 5],
padding="same",
activation=tf.nn.relu)
# Pooling Layer #1
# First max pooling layer with a 2x2 filter and stride of 2
# Input Tensor Shape: [batch_size, 28, 28, 32]
# Output Tensor Shape: [batch_size, 14, 14, 32]
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
# Convolutional Layer #2
# Computes 64 features using a 5x5 filter.
# Padding is added to preserve width and height.
# Input Tensor Shape: [batch_size, 14, 14, 32]
# Output Tensor Shape: [batch_size, 14, 14, 64]
conv2 = tf.layers.conv2d(
inputs=pool1,
filters=64,
kernel_size=[5, 5],
padding="same",
activation=tf.nn.relu)
# Pooling Layer #2
# Second max pooling layer with a 2x2 filter and stride of 2
# Input Tensor Shape: [batch_size, 14, 14, 64]
# Output Tensor Shape: [batch_size, 7, 7, 64]
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
# Flatten tensor into a batch of vectors
# Input Tensor Shape: [batch_size, 7, 7, 64]
# Output Tensor Shape: [batch_size, 7 * 7 * 64]
pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
# Dense Layer
# Densely connected layer with 1024 neurons
# Input Tensor Shape: [batch_size, 7 * 7 * 64]
# Output Tensor Shape: [batch_size, 1024]
dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
# Add dropout operation; 0.6 probability that element will be kept
dropout = tf.layers.dropout(
inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
# Logits layer
# Input Tensor Shape: [batch_size, 1024]
# Output Tensor Shape: [batch_size, 10]
logits = tf.layers.dense(inputs=dropout, units=10)
predictions = {
# Generate predictions (for PREDICT and EVAL mode)
"classes": tf.argmax(input=logits, axis=1),
# Add `softmax_tensor` to the graph. It is used for PREDICT and by the
# `logging_hook`.
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate Loss (for both TRAIN and EVAL modes)
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# Configure the Training Op (for TRAIN mode)
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
# Add evaluation metrics (for EVAL mode)
eval_metric_ops = {
"accuracy": tf.metrics.accuracy(
labels=labels, predictions=predictions["classes"])}
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def per_device_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.
Note that this should eventually be handled by DistributionStrategies
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.
Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.
Returns:
Batch size per device.
Raises:
ValueError: if batch_size is not divisible by number of devices
"""
if num_gpus <= 1:
return batch_size
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)
class InputFnProvider:
def __init__(self, train_batch_size):
self.train_batch_size = train_batch_size
self.__load_data()
def __load_data(self):
# Load training and eval data
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
#batch_size
# X_train = tf.cast(X_train, tf.float32)
#X_test = tf.cast(X_test, tf.float32)
# mnist = tf.compat.v1.contrib.learn.datasets.load_dataset("mnist")
self.train_data = X_train # Returns np.array
self.train_labels = Y_train
self.eval_data = X_test # Returns np.array
self.eval_labels = Y_test
def train_input_fn(self):
"""An input function for training"""
# Shuffle, repeat, and batch the examples.
dataset = tf.data.Dataset.from_tensor_slices(({"x": self.train_data}, self.train_labels))
dataset = dataset.shuffle(1000).repeat().batch(self.train_batch_size)
return dataset
def eval_input_fn(self):
"""An input function for evaluation or prediction"""
dataset = tf.data.Dataset.from_tensor_slices(({"x": self.eval_data}, self.eval_labels))
dataset = dataset.batch(1)
return dataset
def main(unused_argv):
batch_size = 100
num_gpus = 2
# input_fn which serves Dataset
input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_gpus))
# Use multiple GPUs by MirroredStragtegy.
# All avaiable GPUs will be used if `num_gpus` is omitted.
if num_gpus > 1:
distribution = tf.distribute.experimental.MultiWorkerMirroredStrategy()
else:
distribution = None
# Pass to RunConfig
config = tf.estimator.RunConfig(
train_distribute=distribution,
model_dir="/tmp/mnist_convnet_model")
# Create the Estimator
# pass RunConfig
mnist_classifier = tf.estimator.Estimator(
model_fn=cnn_model_fn,
config=config)
# Train the model
mnist_classifier.train(
input_fn=input_fn_provider.train_input_fn,
steps=1000)
# Evaluate the model and print results
eval_results = mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn)
print(eval_results)
if __name__ == "__main__":
tf.app.run()
来自未来导入绝对导入
来自未来进口部
来自未来导入打印功能
将numpy作为np导入
导入tensorflow作为tf
导入操作系统
导入json
将tensorflow.compat.v1导入为tf
tf.disable_v2_behavior()
tf.logging.set_详细性(tf.logging.INFO)
从tensorflow.keras.datasets导入mnist
os.environ['TF_CONFIG']=json.dumps({
“集群”:{
‘worker’:[“gpu11.cse.cuhk.edu.hk:8000”,“gpu12.cse.cuhk.edu.hk:8000”]
},
'task':{'type':'worker','index':0}
})
def cnn_model_fn(功能、标签、模式):
“”“CNN的模型函数。”“”
#输入层
#将X重塑为4-D张量:[批量大小、宽度、高度、通道]
#MNIST图像为28x28像素,具有一个颜色通道
输入层=tf.重塑(特征[“x”],[-1,28,28,1])
输入层=tf.cast(输入层,tf.float32)
labels=tf.cast(labels,tf.int32)
#卷积层#1
#使用带有ReLU激活的5x5过滤器计算32个功能。
#添加填充以保留宽度和高度。
#输入张量形状:[批量大小,28,28,1]
#输出张量形状:[批量大小,28,28,32]
conv1=tf.layers.conv2d(
输入=输入层,
过滤器=32,
内核大小=[5,5],
padding=“相同”,
激活=tf.nn.relu)
#池层#1
#第一个最大池层,具有2x2过滤器和2的步长
#输入张量形状:[批量大小,28,28,32]
#输出张量形状:[批量大小,14,14,32]
pool1=tf.layers.max_poolg2d(输入=conv1,pool_size=[2,2],步长=2)
#卷积层#2
#使用5x5过滤器计算64个要素。
#添加填充以保留宽度和高度。
#输入张量形状:[批量大小,14,14,32]
#输出张量形状:[批量大小,14,14,64]
conv2=tf.layers.conv2d(
输入=池1,
过滤器=64,
内核大小=[5,5],
padding=“相同”,
激活=tf.nn.relu)
#池层#2
#第二个最大池层,具有2x2过滤器和2的步长
#输入张量形状:[批量大小,14,14,64]
#输出张量形状:[批量大小,7,7,64]
pool2=tf.layers.max_poolg2d(输入=conv2,pool_size=[2,2],步长=2)
#将张量展平为一批向量
#输入张量形状:[批量大小,7,7,64]
#输出张量形状:[批量大小,7*7*64]
pool2_flat=tf.重塑(pool2,[-1,7*7*64])
#致密层
#有1024个神经元的密集连接层
#输入张量形状:[批量大小,7*7*64]
#输出张量形状:[批量大小,1024]
密集=tf.layers.dense(输入=池2_平坦,单位=1024,激活=tf.nn.relu)
#添加退出操作;0.6元素保留的概率
dropout=tf.layers.dropout(
输入=密集,速率=0.4,训练=模式==tf.estimator.ModeKeys.TRAIN)
#登录层
#输入张量形状:[批量大小,1024]
#输出张量形状:[批量大小,10]
logits=tf.layers.densite(输入=辍学,单位=10)
预测={
#生成预测(用于预测和评估模式)
“类”:tf.argmax(输入=logits,轴=1),
#将'softmax_tensor'添加到图形中。它用于预测和
#'logging_hook`。
“概率”:tf.nn.softmax(logits,name=“softmax\u tensor”)
}
如果mode==tf.estimator.ModeKeys.PREDICT:
返回tf.estimator.EstimatorSpec(模式=模式,预测=预测)
#计算损失(列车和评估模式)
损耗=tf.loss.sparse\u softmax\u cross\u熵(标签=labels,logits=logits)
#配置培训Op(针对培训模式)
如果模式==tf.estimator.ModeKeys.TRAIN:
优化器=tf.train.GradientDescentOptimizer(学习率=0.001)
列车运行=优化器。最小化(
损失=损失,
global\u step=tf.train.get\u global\u step())
返回tf.estimator.EstimatorSpec(模式=模式,损耗=损耗,列车运行=列车运行)
#添加评估指标(用于评估模式)
评估指标操作={
“准确性”:tf.metrics.accurity(
标签=标签,预测=预测[“类”])}
返回tf.estimator.estimator规范(
模式=模式,损耗=损耗,评估度量操作=评估度量操作)
每个设备的def批次大小(批次大小,数量GPU):
“”“对于多gpu,批大小必须是gpu数量的倍数。
请注意,这最终应该由DistributionStrategies处理
直接。多GPU支持目前处于试验阶段,
因此,在该功能就位之前,请在此处执行此工作。
Args:
批次大小:要在设备之间划分的全局批次大小。这应该是
等于num_GPU乘以用于多GPU培训的单个GPU批量大小。
num_gpu:分配策略使用了多少个gpu。
返回:
每个设备的批量大小。
提出:
ValueError:如果批处理大小不能被设备数整除
"""
如果num_gpus 1:
distribution=tf.distribute.experimental.MultiWorkerMirroredStrategy()
其他:
分布=无
#传递到RunConfig
config=tf.estimator.RunConfig(
列车分配=分配,
model_dir=“/tmp/mnist_convnet_model”)
#创建估计器
#传递RunConfig
mnist_分类器=tf.estimator.estimator(
model_fn=cnn_model_fn,
config=config)
#训练模型
mnist_.train(
输入\u fn=输入\u fn\u提供程序。训练\u输入\u fn,
步数=1000)
#评估模型并打印结果
eval\u results=mnist\u classifier.evaluate(输入\u fn=input\u fn\u provider.eval\u输入\u fn)
打印(评估)
File "mnist.py", line 223, in <module>
tf.app.run()
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "mnist.py", line 213, in main
mnist_classifier.train(
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 349, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1173, in _train_model
return self._train_model_distributed(input_fn, hooks, saving_listeners)
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1226, in _train_model_distributed
distribute_coordinator_training.estimator_train(
File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/estimator_training.py", line 310, in estimator_train
raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
ValueError: Only `STANDALONE_CLIENT` mode is supported when you call `estimator.train`