Python 为什么tf.keras.layers.Conv2D在tflite模型中的推理需要这么长时间?

Python 为什么tf.keras.layers.Conv2D在tflite模型中的推理需要这么长时间?,python,tensorflow,keras,tensorflow-lite,Python,Tensorflow,Keras,Tensorflow Lite,我有一个resnet31(在TF2.3中编码,它是一个子类tf.keras.Layer),当导出为savedmodel时运行得很快,但当我尝试将savemodel转换为tflite模型时,运行速度非常慢。我使用tflite中提供的基准工具来计算每种层的推断时间,结果发现CONV_2D非常慢,我不明白为什么 这是基准测试工具的输出,您可以看到运行CONV_2D需要16秒: Number of nodes executed: 89 ============================== Sum

我有一个resnet31(使用TF 2.3编写,是tf.keras.layers.Layer的子类),导出为SavedModel时运行得很快,但当我把SavedModel转换为tflite模型后,运行速度非常慢。我使用tflite提供的基准测试工具来统计每种层的推理时间,结果发现CONV_2D非常慢,我不明白为什么。

这是基准测试工具的输出,您可以看到运行CONV_2D需要16秒:

Number of nodes executed: 89
============================== Summary by node type ==============================
                 [Node type]      [count]     [avg ms]      [avg %]     [cdf %]   [mem KB]  [times called]
                     CONV_2D           33    16353.751      99.844%     99.844%      0.000         33
                       WHILE            2       20.836       0.127%     99.971%      0.000          2
                         ADD           11        1.956       0.012%     99.983%      0.000         11
                 MAX_POOL_2D            3        1.496       0.009%     99.992%      0.000          3
                  REDUCE_MAX            1        0.881       0.005%     99.997%      0.000          1
             FULLY_CONNECTED           13        0.339       0.002%     99.999%      0.000         13
                  REVERSE_V2            2        0.029       0.000%    100.000%      0.000          2
                   TRANSPOSE            2        0.020       0.000%    100.000%      0.000          2
                        FILL            4        0.016       0.000%    100.000%      0.000          4
               STRIDED_SLICE            5        0.011       0.000%    100.000%      0.000          5
                        PACK            5        0.011       0.000%    100.000%      0.000          5
                         MUL            2        0.004       0.000%    100.000%      0.000          2
               CONCATENATION            1        0.003       0.000%    100.000%      0.000          1
                       SHAPE            3        0.002       0.000%    100.000%      0.000          3
                     ONE_HOT            1        0.001       0.000%    100.000%      0.000          1
                     RESHAPE            1        0.000       0.000%    100.000%      0.000          1
我通过以下方式转换了savedmodel:

# Convert the exported SavedModel into a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
# Optimize.OPTIMIZE_FOR_LATENCY is deprecated and documented as doing the same
# as Optimize.DEFAULT, so request DEFAULT directly.
# NOTE(review): DEFAULT enables post-training weight quantization; the
# quantized CONV_2D path may be what the benchmark is measuring -- convert
# once without `optimizations` to confirm.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS,  # enable TensorFlow ops.
]

tflite_model = converter.convert()
import tensorflow as tf

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Conv2D, BatchNormalization, Lambda, MaxPool2D, AveragePooling2D, Dense, Layer
from tensorflow.keras import Model




class ResnetIdentityV1(Layer):
    """Basic ResNet v1 residual block: two 3x3 convolutions plus a shortcut.

    Reference: https://arxiv.org/pdf/1512.03385.pdf

    The main path is conv -> batch-norm -> ReLU -> conv -> batch-norm. The
    shortcut branch is added to the main path before the final ReLU.
    """

    def __init__(self, filters, conv_shortcut):
        """Create the sub-layers of the block.

        Parameters
        ----------
        filters
            Number of filters used by every convolution in the block.
        conv_shortcut
            If truthy, the shortcut branch is a 1x1 convolution followed by
            batch normalization; otherwise it is an identity ``Lambda``.
        """
        super().__init__()
        self.filters = filters

        # Both main-path convolutions share the same hyper-parameters.
        main_conv_args = dict(filters=filters,
                              kernel_size=(3, 3),
                              strides=1,
                              padding='same')

        self.conv1 = Conv2D(**main_conv_args)
        self.bn1 = BatchNormalization()

        self.conv2 = Conv2D(**main_conv_args)
        self.bn2 = BatchNormalization()

        # Shortcut branch (stride 1, so the spatial size is left unchanged).
        self.downsample = Sequential()
        if conv_shortcut:
            shortcut_layers = (
                Conv2D(filters=filters, kernel_size=(1, 1), strides=1),
                BatchNormalization(),
            )
        else:
            shortcut_layers = (Lambda(lambda x: x),)
        for shortcut_layer in shortcut_layers:
            self.downsample.add(shortcut_layer)

    def call(self, inputs, training=False, **kwargs):
        """Run the residual block.

        Parameters
        ----------
        inputs
            Input tensor (the original documentation states 4-d, NHWC).
        training
            Boolean forwarded to the batch-normalization layers and to the
            shortcut branch.

        Returns
        -------
        ReLU of the sum of the main path and the shortcut branch.
        """
        residual = self.downsample(inputs, training=training)

        # Main path, first stage: conv -> bn -> relu.
        out = tf.nn.relu(self.bn1(self.conv1(inputs), training=training))

        # Main path, second stage: conv -> bn; the ReLU comes after the add.
        out = self.bn2(self.conv2(out), training=training)

        return tf.nn.relu(out + residual)


class ResnetBlock31V1(Layer):
    """One ResNet-31 stage: a chain of ``ResnetIdentityV1`` blocks.

    Reference: https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, blocks):
        """Stack the residual blocks of the stage.

        Parameters
        ----------
        filters
            Number of filters passed to every residual block.
        blocks
            Number of residual blocks in the stage. One block is always
            created, even when ``blocks`` is smaller than 1.
        """
        super().__init__()

        self.identity_block = ResnetIdentityV1
        self.res_block = Sequential()

        # Only the first block gets a convolutional shortcut; every
        # following block uses the identity shortcut.
        shortcut_flags = [True] + [False] * (blocks - 1)
        for use_conv_shortcut in shortcut_flags:
            self.res_block.add(
                self.identity_block(filters, conv_shortcut=use_conv_shortcut))

    def call(self, inputs, training=False, **kwargs):
        """Apply the stacked residual blocks to ``inputs``.

        ``training`` is forwarded to the inner ``Sequential``.
        """
        return self.res_block(inputs, training=training)


class ResNet31V1(Layer):
    """ResNet-31 convolutional backbone.

    Reference: https://arxiv.org/pdf/1512.03385.pdf

    The network is a stem of two conv-bn-relu layers followed by four
    residual stages, each followed by its own conv-bn-relu layer. Three
    max-pooling layers are applied: two with pool size 2 and one with pool
    size (2, 1), so the first pooled axis is reduced 8x and the second 4x.
    """

    # `call` indexes exactly this many residual stages.
    _NUM_STAGES = 4

    def __init__(self, blocks_filters, blocks_repetition,
                 num_classes=None):
        """Create all sub-layers.

        Parameters
        ----------
        blocks_filters
            Iterable with the number of filters of each residual stage.
        blocks_repetition
            Iterable with the number of residual blocks of each stage. It is
            zipped with ``blocks_filters``.
        num_classes
            Accepted for interface compatibility; this layer does not read it.

        Raises
        ------
        ValueError
            If fewer than four (filters, repetition) pairs are supplied,
            because ``call`` uses stages 0 to 3.
        """
        # Materialize the pairs first so the stage count can be validated
        # before any sub-layer is created.
        stage_specs = list(zip(blocks_filters, blocks_repetition))
        if len(stage_specs) < self._NUM_STAGES:
            raise ValueError(
                'ResNet31V1 needs at least %d residual stages, got %d'
                % (self._NUM_STAGES, len(stage_specs)))

        super(ResNet31V1, self).__init__()

        # Stem.
        self.conv1 = Conv2D(filters=64, kernel_size=3, strides=1, padding='same')
        self.conv2 = Conv2D(filters=128, kernel_size=3, strides=1, padding='same')

        self.bn1 = BatchNormalization()
        self.bn2 = BatchNormalization()

        self.pool1 = MaxPool2D(pool_size=2, strides=2, padding='valid')

        # Residual stages.
        self.resnet_layers = []
        for f, b in stage_specs:
            self.resnet_layers.append(
                ResnetBlock31V1(filters=f, blocks=b))

        self.n_layers = len(self.resnet_layers)

        # Conv-bn-relu (and pooling) layers applied after each stage.
        self.conv3 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn3 = BatchNormalization()
        self.pool2 = MaxPool2D(pool_size=2, strides=2, padding='valid')

        self.conv4 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn4 = BatchNormalization()
        # Pools only along the first pooled axis.
        self.pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='valid')

        self.conv5 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn5 = BatchNormalization()
        self.conv6 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn6 = BatchNormalization()

    def call(self, inputs, training=False, **kwargs):
        """Compute the feature map for ``inputs``.

        ``training`` is forwarded to every batch-normalization layer and
        residual stage. Returns the output of the last conv-bn-relu layer.
        """
        # Stem: two conv-bn-relu layers, then the first pooling.
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool1(x)

        # Stage 0 + conv-bn-relu + pooling.
        x = self.resnet_layers[0](x, training=training)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool2(x)

        # Stage 1 + conv-bn-relu + (2, 1) pooling.
        x = self.resnet_layers[1](x, training=training)
        x = self.conv4(x)
        x = self.bn4(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool3(x)

        # Stage 2 + conv-bn-relu.
        x = self.resnet_layers[2](x, training=training)
        x = self.conv5(x)
        x = self.bn5(x, training=training)
        x = tf.nn.relu(x)

        # Stage 3 + conv-bn-relu.
        x = self.resnet_layers[3](x, training=training)
        x = self.conv6(x)
        x = self.bn6(x, training=training)
        x = tf.nn.relu(x)

        return x

def resnet_v1_31(num_classes=None):
    """Build the ResNet-31 layer with its fixed stage configuration.

    The four residual stages use 256, 256, 512 and 512 filters and are
    repeated 1, 2, 5 and 3 times. ``num_classes`` is forwarded unchanged to
    ``ResNet31V1``.

    Returns the constructed ``ResNet31V1`` instance.
    """
    stage_filters = [256, 256, 512, 512]
    stage_repeats = [1, 2, 5, 3]
    return ResNet31V1(blocks_filters=stage_filters,
                      blocks_repetition=stage_repeats,
                      num_classes=num_classes)
resnet31的编码方式如下:

# Convert the exported SavedModel into a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
# Optimize.OPTIMIZE_FOR_LATENCY is deprecated and documented as doing the same
# as Optimize.DEFAULT, so request DEFAULT directly.
# NOTE(review): DEFAULT enables post-training weight quantization; the
# quantized CONV_2D path may be what the benchmark is measuring -- convert
# once without `optimizations` to confirm.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS,  # enable TensorFlow ops.
]

tflite_model = converter.convert()
import tensorflow as tf

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Conv2D, BatchNormalization, Lambda, MaxPool2D, AveragePooling2D, Dense, Layer
from tensorflow.keras import Model




class ResnetIdentityV1(Layer):
    """Basic ResNet v1 residual block: two 3x3 convolutions plus a shortcut.

    Reference: https://arxiv.org/pdf/1512.03385.pdf

    The main path is conv -> batch-norm -> ReLU -> conv -> batch-norm. The
    shortcut branch is added to the main path before the final ReLU.
    """

    def __init__(self, filters, conv_shortcut):
        """Create the sub-layers of the block.

        Parameters
        ----------
        filters
            Number of filters used by every convolution in the block.
        conv_shortcut
            If truthy, the shortcut branch is a 1x1 convolution followed by
            batch normalization; otherwise it is an identity ``Lambda``.
        """
        super().__init__()
        self.filters = filters

        # Both main-path convolutions share the same hyper-parameters.
        main_conv_args = dict(filters=filters,
                              kernel_size=(3, 3),
                              strides=1,
                              padding='same')

        self.conv1 = Conv2D(**main_conv_args)
        self.bn1 = BatchNormalization()

        self.conv2 = Conv2D(**main_conv_args)
        self.bn2 = BatchNormalization()

        # Shortcut branch (stride 1, so the spatial size is left unchanged).
        self.downsample = Sequential()
        if conv_shortcut:
            shortcut_layers = (
                Conv2D(filters=filters, kernel_size=(1, 1), strides=1),
                BatchNormalization(),
            )
        else:
            shortcut_layers = (Lambda(lambda x: x),)
        for shortcut_layer in shortcut_layers:
            self.downsample.add(shortcut_layer)

    def call(self, inputs, training=False, **kwargs):
        """Run the residual block.

        Parameters
        ----------
        inputs
            Input tensor (the original documentation states 4-d, NHWC).
        training
            Boolean forwarded to the batch-normalization layers and to the
            shortcut branch.

        Returns
        -------
        ReLU of the sum of the main path and the shortcut branch.
        """
        residual = self.downsample(inputs, training=training)

        # Main path, first stage: conv -> bn -> relu.
        out = tf.nn.relu(self.bn1(self.conv1(inputs), training=training))

        # Main path, second stage: conv -> bn; the ReLU comes after the add.
        out = self.bn2(self.conv2(out), training=training)

        return tf.nn.relu(out + residual)


class ResnetBlock31V1(Layer):
    """One ResNet-31 stage: a chain of ``ResnetIdentityV1`` blocks.

    Reference: https://arxiv.org/pdf/1512.03385.pdf
    """

    def __init__(self, filters, blocks):
        """Stack the residual blocks of the stage.

        Parameters
        ----------
        filters
            Number of filters passed to every residual block.
        blocks
            Number of residual blocks in the stage. One block is always
            created, even when ``blocks`` is smaller than 1.
        """
        super().__init__()

        self.identity_block = ResnetIdentityV1
        self.res_block = Sequential()

        # Only the first block gets a convolutional shortcut; every
        # following block uses the identity shortcut.
        shortcut_flags = [True] + [False] * (blocks - 1)
        for use_conv_shortcut in shortcut_flags:
            self.res_block.add(
                self.identity_block(filters, conv_shortcut=use_conv_shortcut))

    def call(self, inputs, training=False, **kwargs):
        """Apply the stacked residual blocks to ``inputs``.

        ``training`` is forwarded to the inner ``Sequential``.
        """
        return self.res_block(inputs, training=training)


class ResNet31V1(Layer):
    """ResNet-31 convolutional backbone.

    Reference: https://arxiv.org/pdf/1512.03385.pdf

    The network is a stem of two conv-bn-relu layers followed by four
    residual stages, each followed by its own conv-bn-relu layer. Three
    max-pooling layers are applied: two with pool size 2 and one with pool
    size (2, 1), so the first pooled axis is reduced 8x and the second 4x.
    """

    # `call` indexes exactly this many residual stages.
    _NUM_STAGES = 4

    def __init__(self, blocks_filters, blocks_repetition,
                 num_classes=None):
        """Create all sub-layers.

        Parameters
        ----------
        blocks_filters
            Iterable with the number of filters of each residual stage.
        blocks_repetition
            Iterable with the number of residual blocks of each stage. It is
            zipped with ``blocks_filters``.
        num_classes
            Accepted for interface compatibility; this layer does not read it.

        Raises
        ------
        ValueError
            If fewer than four (filters, repetition) pairs are supplied,
            because ``call`` uses stages 0 to 3.
        """
        # Materialize the pairs first so the stage count can be validated
        # before any sub-layer is created.
        stage_specs = list(zip(blocks_filters, blocks_repetition))
        if len(stage_specs) < self._NUM_STAGES:
            raise ValueError(
                'ResNet31V1 needs at least %d residual stages, got %d'
                % (self._NUM_STAGES, len(stage_specs)))

        super(ResNet31V1, self).__init__()

        # Stem.
        self.conv1 = Conv2D(filters=64, kernel_size=3, strides=1, padding='same')
        self.conv2 = Conv2D(filters=128, kernel_size=3, strides=1, padding='same')

        self.bn1 = BatchNormalization()
        self.bn2 = BatchNormalization()

        self.pool1 = MaxPool2D(pool_size=2, strides=2, padding='valid')

        # Residual stages.
        self.resnet_layers = []
        for f, b in stage_specs:
            self.resnet_layers.append(
                ResnetBlock31V1(filters=f, blocks=b))

        self.n_layers = len(self.resnet_layers)

        # Conv-bn-relu (and pooling) layers applied after each stage.
        self.conv3 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn3 = BatchNormalization()
        self.pool2 = MaxPool2D(pool_size=2, strides=2, padding='valid')

        self.conv4 = Conv2D(filters=256, kernel_size=3, strides=1, padding='same')
        self.bn4 = BatchNormalization()
        # Pools only along the first pooled axis.
        self.pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='valid')

        self.conv5 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn5 = BatchNormalization()
        self.conv6 = Conv2D(filters=512, kernel_size=3, strides=1, padding='same')
        self.bn6 = BatchNormalization()

    def call(self, inputs, training=False, **kwargs):
        """Compute the feature map for ``inputs``.

        ``training`` is forwarded to every batch-normalization layer and
        residual stage. Returns the output of the last conv-bn-relu layer.
        """
        # Stem: two conv-bn-relu layers, then the first pooling.
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool1(x)

        # Stage 0 + conv-bn-relu + pooling.
        x = self.resnet_layers[0](x, training=training)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool2(x)

        # Stage 1 + conv-bn-relu + (2, 1) pooling.
        x = self.resnet_layers[1](x, training=training)
        x = self.conv4(x)
        x = self.bn4(x, training=training)
        x = tf.nn.relu(x)
        x = self.pool3(x)

        # Stage 2 + conv-bn-relu.
        x = self.resnet_layers[2](x, training=training)
        x = self.conv5(x)
        x = self.bn5(x, training=training)
        x = tf.nn.relu(x)

        # Stage 3 + conv-bn-relu.
        x = self.resnet_layers[3](x, training=training)
        x = self.conv6(x)
        x = self.bn6(x, training=training)
        x = tf.nn.relu(x)

        return x

def resnet_v1_31(num_classes=None):
    """Build the ResNet-31 layer with its fixed stage configuration.

    The four residual stages use 256, 256, 512 and 512 filters and are
    repeated 1, 2, 5 and 3 times. ``num_classes`` is forwarded unchanged to
    ``ResNet31V1``.

    Returns the constructed ``ResNet31V1`` instance.
    """
    stage_filters = [256, 256, 512, 512]
    stage_repeats = [1, 2, 5, 3]
    return ResNet31V1(blocks_filters=stage_filters,
                      blocks_repetition=stage_repeats,
                      num_classes=num_classes)

提前感谢您的帮助

您的输入大小是多少?您是否尝试过其他委托(delegate)或增加CPU线程数?另外,您使用的是Android基准测试工具,对吗?