Python 为什么tf.keras.layers.Conv2D在tflite模型中的推理需要这么长时间?
标签:python, tensorflow, keras, tensorflow-lite
我有一个resnet31(用TF 2.3编写,是tf.keras.layers.Layer的子类),导出为SavedModel时运行得很快,但当我把这个SavedModel转换为tflite模型后,运行速度却非常慢。我使用tflite提供的基准测试工具统计了每种层的推理时间,结果发现CONV_2D非常慢,我不明白为什么。这是基准测试工具的输出,可以看到运行CONV_2D需要大约16秒:
Number of nodes executed: 89
============================== Summary by node type ==============================
[Node type] [count] [avg ms] [avg %] [cdf %] [mem KB] [times called]
CONV_2D 33 16353.751 99.844% 99.844% 0.000 33
WHILE 2 20.836 0.127% 99.971% 0.000 2
ADD 11 1.956 0.012% 99.983% 0.000 11
MAX_POOL_2D 3 1.496 0.009% 99.992% 0.000 3
REDUCE_MAX 1 0.881 0.005% 99.997% 0.000 1
FULLY_CONNECTED 13 0.339 0.002% 99.999% 0.000 13
REVERSE_V2 2 0.029 0.000% 100.000% 0.000 2
TRANSPOSE 2 0.020 0.000% 100.000% 0.000 2
FILL 4 0.016 0.000% 100.000% 0.000 4
STRIDED_SLICE 5 0.011 0.000% 100.000% 0.000 5
PACK 5 0.011 0.000% 100.000% 0.000 5
MUL 2 0.004 0.000% 100.000% 0.000 2
CONCATENATION 1 0.003 0.000% 100.000% 0.000 1
SHAPE 3 0.002 0.000% 100.000% 0.000 3
ONE_HOT 1 0.001 0.000% 100.000% 0.000 1
RESHAPE 1 0.000 0.000% 100.000% 0.000 1
我通过以下方式转换了savedmodel:
# Convert the exported SavedModel directory into a TensorFlow Lite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
# `Optimize.OPTIMIZE_FOR_LATENCY` is deprecated and documented as doing the
# same as `DEFAULT`, so request `DEFAULT` directly.
# NOTE(review): without a representative dataset this presumably results in
# dynamic-range weight quantization; when diagnosing slow CONV_2D, also
# benchmark a conversion with `optimizations` left unset -- TODO confirm.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS,  # enable TensorFlow ops.
]
tflite_model = converter.convert()
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Conv2D, BatchNormalization, Lambda, MaxPool2D, AveragePooling2D, Dense, Layer
from tensorflow.keras import Model
class ResnetIdentityV1(Layer):
    """Basic (non-bottleneck) ResNet V1 residual block.

    Main path: conv3x3 -> BN -> ReLU -> conv3x3 -> BN. The shortcut branch is
    added to the main path before the final ReLU. Both 3x3 convolutions use
    stride 1 and 'same' padding.

    Reference: https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, conv_shortcut, **kwargs):
        """
        Parameters
        ----------
        filters
            Number of output channels of both 3x3 convolutions, and of the
            1x1 shortcut convolution when `conv_shortcut` is truthy.
        conv_shortcut
            If truthy, the shortcut branch is a 1x1 convolution (stride 1)
            followed by batch normalization; otherwise it is an identity
            mapping.
        **kwargs
            Forwarded to the base `Layer` constructor (e.g. `name`, `dtype`),
            which also makes `from_config` round-trips possible.
        """
        super(ResnetIdentityV1, self).__init__(**kwargs)
        self.filters = filters
        # Kept so that get_config() can report how the block was built.
        self.conv_shortcut = conv_shortcut
        # conv 1
        self.conv1 = Conv2D(filters=filters,
                            kernel_size=(3, 3),
                            strides=1,
                            padding='same')
        self.bn1 = BatchNormalization()
        # conv 2
        self.conv2 = Conv2D(filters=filters,
                            kernel_size=(3, 3),
                            strides=1,
                            padding='same')
        self.bn2 = BatchNormalization()
        # Shortcut branch. Note the 1x1 convolution uses stride 1, so it only
        # projects channels; it does not reduce the spatial size.
        self.downsample = Sequential()
        if conv_shortcut:
            self.downsample.add(Conv2D(filters=filters,
                                       kernel_size=(1, 1),
                                       strides=1))
            self.downsample.add(BatchNormalization())
        else:
            # Identity shortcut: the input is added to the main path as is,
            # so its shape must be compatible with the conv2/bn2 output.
            self.downsample.add(Lambda(lambda x: x))

    def call(self, inputs, training=False, **kwargs):
        """Apply the residual block.

        Parameters
        ----------
        inputs
            Input tensor; the original author documents it as 4-d NHWC.
        training
            Boolean forwarded to the batch-normalization layers and to the
            shortcut branch.

        Returns
        -------
        ReLU of (main path output + shortcut output).
        """
        shortcut = self.downsample(inputs, training=training)
        # first conv-bn-relu
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        # second conv-bn, then add the shortcut before the final relu
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x + shortcut)
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(ResnetIdentityV1, self).get_config()
        config.update({
            'filters': self.filters,
            'conv_shortcut': self.conv_shortcut,
        })
        return config
class ResnetBlock31V1(Layer):
    """One residual stage of the resnet31: a chain of identity blocks.

    The first block of the chain is built with a convolutional shortcut and
    every following block with an identity shortcut.

    https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, blocks):
        """
        Parameters
        ----------
        filters
            Passed to every identity block as its `filters` argument.
        blocks
            Requested number of identity blocks; at least one block is always
            created because the first one is added unconditionally.
        """
        super(ResnetBlock31V1, self).__init__()
        self.identity_block = ResnetIdentityV1
        self.res_block = Sequential()
        # One flag per block: True only for the leading block.
        shortcut_flags = [True] + [False for _ in range(1, blocks)]
        for use_conv_shortcut in shortcut_flags:
            block = self.identity_block(filters,
                                        conv_shortcut=use_conv_shortcut)
            self.res_block.add(block)

    def call(self, inputs, training=False, **kwargs):
        """Run `inputs` through the stacked identity blocks."""
        return self.res_block(inputs, training=training)
class ResNet31V1(Layer):
    """Resnet31 convolutional backbone.

    https://arxiv.org/pdf/1512.03385.pdf

    The forward pass interleaves six conv-BN-ReLU stems with four residual
    stages and three max-pool layers: two 2x2 pools and one (2, 1) pool, all
    with 'valid' padding. The first spatial axis is therefore pooled three
    times and the second twice (the original note reads "downsampling of
    4x8").
    """

    # call() addresses resnet_layers[0] .. resnet_layers[3] explicitly.
    _NUM_STAGES = 4

    def __init__(self, blocks_filters, blocks_repetition,
                 num_classes=None):
        """
        Parameters
        ----------
        blocks_filters
            Iterable with the `filters` value of each residual stage.
        blocks_repetition
            Iterable with the number of identity blocks of each residual
            stage. It is zipped with `blocks_filters`, so the shorter of the
            two decides how many stages are built.
        num_classes
            Accepted for interface compatibility; it is not used by this
            class (no classification head is built here).

        Raises
        ------
        ValueError
            If fewer than four residual stages can be built, because `call`
            would otherwise fail later with an IndexError.
        """
        super(ResNet31V1, self).__init__()
        self.conv1 = Conv2D(filters=64, kernel_size=3, strides=1, padding='same')
        self.conv2 = Conv2D(filters=128, kernel_size=3, strides=1, padding='same')
        self.bn1 = BatchNormalization()
        self.bn2 = BatchNormalization()
        self.pool1 = MaxPool2D(pool_size=2, strides=2, padding='valid')
        self.resnet_layers = []
        for f, b in zip(blocks_filters, blocks_repetition):
            self.resnet_layers.append(
                ResnetBlock31V1(filters=f, blocks=b))
        self.n_layers = len(self.resnet_layers)
        if self.n_layers < self._NUM_STAGES:
            # Fail fast: call() indexes the first four stages unconditionally.
            raise ValueError(
                'ResNet31V1 needs at least {} residual stages, got {}; check '
                'blocks_filters and blocks_repetition.'.format(
                    self._NUM_STAGES, self.n_layers))
        self.conv3 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn3 = BatchNormalization()
        self.pool2 = MaxPool2D(pool_size=2, strides=2, padding='valid')
        self.conv4 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn4 = BatchNormalization()
        # Pools only along the first spatial axis.
        self.pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='valid')
        self.conv5 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn5 = BatchNormalization()
        self.conv6 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn6 = BatchNormalization()

    def call(self, inputs, training=False, **kwargs):
        """Compute the backbone features.

        Parameters
        ----------
        inputs
            Input tensor fed to the first convolution.
        training
            Boolean forwarded to every batch-normalization layer and
            residual stage.

        Returns
        -------
        The ReLU-activated output of the last conv-BN stem (512 filters).
        """
        # stem: two conv-bn-relu, then 2x2 pooling
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool1(x)
        # stage 1
        x = self.resnet_layers[0](x, training=training)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool2(x)
        # stage 2
        x = self.resnet_layers[1](x, training=training)
        x = self.conv4(x)
        x = self.bn4(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool3(x)
        # stage 3
        x = self.resnet_layers[2](x, training=training)
        x = self.conv5(x)
        x = self.bn5(x, training=training)
        x = tf.nn.relu(x)
        # stage 4
        x = self.resnet_layers[3](x, training=training)
        x = self.conv6(x)
        x = self.bn6(x, training=training)
        x = tf.nn.relu(x)
        return x
def resnet_v1_31(num_classes=None):
    """Build the ResNet31V1 layer with its standard stage configuration.

    Four residual stages are requested, with 256, 256, 512 and 512 filters,
    repeated 1, 2, 5 and 3 times respectively. `num_classes` is passed
    through to the ResNet31V1 constructor unchanged.
    """
    stage_filters = [256, 256, 512, 512]
    stage_repeats = [1, 2, 5, 3]
    return ResNet31V1(stage_filters, stage_repeats, num_classes=num_classes)
resnet31的编码方式如下:
# Convert the exported SavedModel directory into a TensorFlow Lite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
# `Optimize.OPTIMIZE_FOR_LATENCY` is deprecated and documented as doing the
# same as `DEFAULT`, so request `DEFAULT` directly.
# NOTE(review): without a representative dataset this presumably results in
# dynamic-range weight quantization; when diagnosing slow CONV_2D, also
# benchmark a conversion with `optimizations` left unset -- TODO confirm.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS,  # enable TensorFlow ops.
]
tflite_model = converter.convert()
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Conv2D, BatchNormalization, Lambda, MaxPool2D, AveragePooling2D, Dense, Layer
from tensorflow.keras import Model
class ResnetIdentityV1(Layer):
    """Basic (non-bottleneck) ResNet V1 residual block.

    Main path: conv3x3 -> BN -> ReLU -> conv3x3 -> BN. The shortcut branch is
    added to the main path before the final ReLU. Both 3x3 convolutions use
    stride 1 and 'same' padding.

    Reference: https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, conv_shortcut, **kwargs):
        """
        Parameters
        ----------
        filters
            Number of output channels of both 3x3 convolutions, and of the
            1x1 shortcut convolution when `conv_shortcut` is truthy.
        conv_shortcut
            If truthy, the shortcut branch is a 1x1 convolution (stride 1)
            followed by batch normalization; otherwise it is an identity
            mapping.
        **kwargs
            Forwarded to the base `Layer` constructor (e.g. `name`, `dtype`),
            which also makes `from_config` round-trips possible.
        """
        super(ResnetIdentityV1, self).__init__(**kwargs)
        self.filters = filters
        # Kept so that get_config() can report how the block was built.
        self.conv_shortcut = conv_shortcut
        # conv 1
        self.conv1 = Conv2D(filters=filters,
                            kernel_size=(3, 3),
                            strides=1,
                            padding='same')
        self.bn1 = BatchNormalization()
        # conv 2
        self.conv2 = Conv2D(filters=filters,
                            kernel_size=(3, 3),
                            strides=1,
                            padding='same')
        self.bn2 = BatchNormalization()
        # Shortcut branch. Note the 1x1 convolution uses stride 1, so it only
        # projects channels; it does not reduce the spatial size.
        self.downsample = Sequential()
        if conv_shortcut:
            self.downsample.add(Conv2D(filters=filters,
                                       kernel_size=(1, 1),
                                       strides=1))
            self.downsample.add(BatchNormalization())
        else:
            # Identity shortcut: the input is added to the main path as is,
            # so its shape must be compatible with the conv2/bn2 output.
            self.downsample.add(Lambda(lambda x: x))

    def call(self, inputs, training=False, **kwargs):
        """Apply the residual block.

        Parameters
        ----------
        inputs
            Input tensor; the original author documents it as 4-d NHWC.
        training
            Boolean forwarded to the batch-normalization layers and to the
            shortcut branch.

        Returns
        -------
        ReLU of (main path output + shortcut output).
        """
        shortcut = self.downsample(inputs, training=training)
        # first conv-bn-relu
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        # second conv-bn, then add the shortcut before the final relu
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x + shortcut)
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(ResnetIdentityV1, self).get_config()
        config.update({
            'filters': self.filters,
            'conv_shortcut': self.conv_shortcut,
        })
        return config
class ResnetBlock31V1(Layer):
    """One residual stage of the resnet31: a chain of identity blocks.

    The first block of the chain is built with a convolutional shortcut and
    every following block with an identity shortcut.

    https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, blocks):
        """
        Parameters
        ----------
        filters
            Passed to every identity block as its `filters` argument.
        blocks
            Requested number of identity blocks; at least one block is always
            created because the first one is added unconditionally.
        """
        super(ResnetBlock31V1, self).__init__()
        self.identity_block = ResnetIdentityV1
        self.res_block = Sequential()
        # One flag per block: True only for the leading block.
        shortcut_flags = [True] + [False for _ in range(1, blocks)]
        for use_conv_shortcut in shortcut_flags:
            block = self.identity_block(filters,
                                        conv_shortcut=use_conv_shortcut)
            self.res_block.add(block)

    def call(self, inputs, training=False, **kwargs):
        """Run `inputs` through the stacked identity blocks."""
        return self.res_block(inputs, training=training)
class ResNet31V1(Layer):
    """Resnet31 convolutional backbone.

    https://arxiv.org/pdf/1512.03385.pdf

    The forward pass interleaves six conv-BN-ReLU stems with four residual
    stages and three max-pool layers: two 2x2 pools and one (2, 1) pool, all
    with 'valid' padding. The first spatial axis is therefore pooled three
    times and the second twice (the original note reads "downsampling of
    4x8").
    """

    # call() addresses resnet_layers[0] .. resnet_layers[3] explicitly.
    _NUM_STAGES = 4

    def __init__(self, blocks_filters, blocks_repetition,
                 num_classes=None):
        """
        Parameters
        ----------
        blocks_filters
            Iterable with the `filters` value of each residual stage.
        blocks_repetition
            Iterable with the number of identity blocks of each residual
            stage. It is zipped with `blocks_filters`, so the shorter of the
            two decides how many stages are built.
        num_classes
            Accepted for interface compatibility; it is not used by this
            class (no classification head is built here).

        Raises
        ------
        ValueError
            If fewer than four residual stages can be built, because `call`
            would otherwise fail later with an IndexError.
        """
        super(ResNet31V1, self).__init__()
        self.conv1 = Conv2D(filters=64, kernel_size=3, strides=1, padding='same')
        self.conv2 = Conv2D(filters=128, kernel_size=3, strides=1, padding='same')
        self.bn1 = BatchNormalization()
        self.bn2 = BatchNormalization()
        self.pool1 = MaxPool2D(pool_size=2, strides=2, padding='valid')
        self.resnet_layers = []
        for f, b in zip(blocks_filters, blocks_repetition):
            self.resnet_layers.append(
                ResnetBlock31V1(filters=f, blocks=b))
        self.n_layers = len(self.resnet_layers)
        if self.n_layers < self._NUM_STAGES:
            # Fail fast: call() indexes the first four stages unconditionally.
            raise ValueError(
                'ResNet31V1 needs at least {} residual stages, got {}; check '
                'blocks_filters and blocks_repetition.'.format(
                    self._NUM_STAGES, self.n_layers))
        self.conv3 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn3 = BatchNormalization()
        self.pool2 = MaxPool2D(pool_size=2, strides=2, padding='valid')
        self.conv4 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn4 = BatchNormalization()
        # Pools only along the first spatial axis.
        self.pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='valid')
        self.conv5 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn5 = BatchNormalization()
        self.conv6 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn6 = BatchNormalization()

    def call(self, inputs, training=False, **kwargs):
        """Compute the backbone features.

        Parameters
        ----------
        inputs
            Input tensor fed to the first convolution.
        training
            Boolean forwarded to every batch-normalization layer and
            residual stage.

        Returns
        -------
        The ReLU-activated output of the last conv-BN stem (512 filters).
        """
        # stem: two conv-bn-relu, then 2x2 pooling
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool1(x)
        # stage 1
        x = self.resnet_layers[0](x, training=training)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool2(x)
        # stage 2
        x = self.resnet_layers[1](x, training=training)
        x = self.conv4(x)
        x = self.bn4(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool3(x)
        # stage 3
        x = self.resnet_layers[2](x, training=training)
        x = self.conv5(x)
        x = self.bn5(x, training=training)
        x = tf.nn.relu(x)
        # stage 4
        x = self.resnet_layers[3](x, training=training)
        x = self.conv6(x)
        x = self.bn6(x, training=training)
        x = tf.nn.relu(x)
        return x
def resnet_v1_31(num_classes=None):
    """Build the ResNet31V1 layer with its standard stage configuration.

    Four residual stages are requested, with 256, 256, 512 and 512 filters,
    repeated 1, 2, 5 and 3 times respectively. `num_classes` is passed
    through to the ResNet31V1 constructor unchanged.
    """
    stage_filters = [256, 256, 512, 512]
    stage_repeats = [1, 2, 5, 3]
    return ResNet31V1(stage_filters, stage_repeats, num_classes=num_classes)
提前感谢您的帮助。
评论:您的输入大小是多少?您是否尝试过使用其他委托(delegate)或增加CPU线程数?您使用的是Android基准测试工具,我说的对吗?