
MatMul in TensorFlow is slower than dot product in numpy


I have observed that, on my machine, tf.matmul in TensorFlow runs significantly slower than the dot product in numpy. I have a GTX 1080 GPU and expect tf.matmul to be at least as fast as when running the code on the CPU (numpy).

Environment info

Operating system

lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 16.10
Release:    16.10
Codename:   yakkety
Installed versions of CUDA and cuDNN:

ls -l /usr/local/cuda-8.0/lib64/libcud*
-rw-r--r-- 1 root      root    556000 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudadevrt.a
lrwxrwxrwx 1 root      root        16 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so -> libcudart.so.8.0
lrwxrwxrwx 1 root      root        19 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0 -> libcudart.so.8.0.61
-rwxr-xr-x 1 root      root    415432 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0.61
-rw-r--r-- 1 root      root    775162 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart_static.a
lrwxrwxrwx 1 voldemaro users       13 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so -> libcudnn.so.5
lrwxrwxrwx 1 voldemaro users       18 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5 -> libcudnn.so.5.1.10
-rwxr-xr-x 1 voldemaro users 84163560 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5.1.10
-rw-r--r-- 1 voldemaro users 70364814 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn_static.a
TensorFlow setup

python -c "import tensorflow; print(tensorflow.__version__)"
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
1.0.0
Code:

'''
Created on Sep 28, 2017

@author: voldemaro

Running on I7/GTX 1080

no MKL
('TF version: ', 'v1.0.0-rc2-15-g47bba63-dirty')
('TF url: ', 'https://github.com/tensorflow/tensorflow/commit/47bba63')
Timing in ms for 2048 x 2048 SVD of type <type 'numpy.float32'> and matmul for 16920 x 2048 of type <type 'numpy.float32'>
numpy default SVD    min:  3956.20, median:  4127.75, mean:  4264.41
TF CPU SVD           min:  5926.43, median:  5951.70, mean:  5961.43
TF GPU SVD           min:  5917.10, median:  6015.87, mean:  6039.63
numpy default .dot product min:  5816.97, median:  5933.43, mean:  5965.22
TF CPU matmul        min: 21939.19, median: 22485.99, mean: 22374.69
TF GPU matmul        min: 22026.52, median: 22109.97, mean: 22199.43
'''

from scipy import linalg;  # for svd
import numpy as np;
import os;
import sys;
import time;

os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"  # nospam

import tensorflow as tf;
import gc; gc.disable();

NUM_RUNS = 5;
dtype = np.float32;
N=2048;
M =  16920;


def get_tensorflow_version_url():
    import tensorflow as tf
    version=tf.__version__
    commit = tf.__git_version__
    # commit looks like this
    # 'v1.0.0-65-g4763edf-dirty'
    commit = commit.replace("'","")
    if commit.endswith('-dirty'):
        dirty = True
        commit = commit[:-len('-dirty')]
    commit=commit.rsplit('-g', 1)[1]
    url = 'https://github.com/tensorflow/tensorflow/commit/'+commit
    return url

def get_mkl_version():
    import ctypes
    import numpy as np
    ver = np.zeros(199, dtype=np.uint8)
    mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
    mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198)
    return ver[ver != 0].tostring()

timeline_counter = 0
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE);


def benchmark(message, func):
    time_list = []
    for i in range(NUM_RUNS):
        start_time = time.time();
        func();
        time_list.append(time.time()-start_time);

    time_list = 1000*np.array(time_list);  # get seconds, convert to ms
    if len(time_list)>0:
        min = np.min(time_list);
        median = np.median(time_list);
        formatted = ["%.2f"%(d,) for d in time_list[:10]];
        result = "min: %8.2f, median: %8.2f, mean: %8.2f"%(min, median, np.mean(time_list))
    else:
        result = "empty"
    print("%-20s %s"%(message, result))


if np.__config__.get_info("lapack_mkl_info"):
    print("MKL version", get_mkl_version())
else:
    print("no MKL")

print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())


svd_array = np.random.random_sample((N,N)).astype(dtype);
another_array = np.random.random_sample((M,N)).astype(dtype);

init_OP = tf.global_variables_initializer();


with tf.device("/gpu:0"):
    init_holder_gpu = tf.placeholder(dtype, shape=(M,M));

    specVarGPU = tf.random_uniform((N,N), dtype=dtype);
    S_gpu = tf.random_uniform((M,N), dtype=dtype);
    V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU, ), tf.transpose(S_gpu));
    [D2_gpu, E1_gpu,  E2_gpu] = tf.svd(specVarGPU);

with tf.device("/cpu:0"):
    init_holder_cpu = tf.placeholder(dtype, shape=(M,M));
    specVarCPU = tf.random_uniform((N,N), dtype=dtype);
    S_cpu = tf.random_uniform((M,N), dtype=dtype);
    V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU, ), tf.transpose(S_cpu));


    [D2_cpu, E1_cpu,  E2_cpu] = tf.svd(specVarCPU);
    V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), E1_cpu), tf.transpose(S_cpu));

print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype));

def func(): linalg.svd(svd_array)
benchmark("numpy default SVD", func)

config = tf.ConfigProto(allow_soft_placement = True, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)));
sess = tf.Session(config = config);
sess.run(init_OP);

def func2(): sess.run([D2_cpu.op, E1_cpu.op,  E2_cpu.op]);
benchmark("TF CPU SVD", func2);

def func3(): sess.run([D2_gpu.op, E1_gpu.op,  E2_gpu.op]);
benchmark("TF GPU SVD", func3);

def func1(): np.transpose(np.asmatrix(another_array)).getH().dot(svd_array).dot(np.transpose(another_array));
benchmark("numpy default .dot product", func1)

def func4(): sess.run([V_cpu]);
benchmark("TF CPU matmul", func4)

def func5(): sess.run([V_gpu])
benchmark("TF GPU matmul", func4)

Apparently TensorFlow does not optimize "nested" operations, so
tf.matmul(tf.transpose(tf.conj(a)), x) takes much longer than b = tf.conj(a), c = tf.transpose(b), and d = tf.matmul(c, x).

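For illustration, a minimal sketch against the TF 1.x API used in the question (the names a and x are placeholders; the adjoint_a flag is assumed to be available in this version's tf.matmul):

import tensorflow as tf

a = tf.random_uniform((2048, 2048), dtype=tf.float32)
x = tf.random_uniform((2048, 2048), dtype=tf.float32)

# Nested form from the question (slow):
d_nested = tf.matmul(tf.transpose(tf.conj(a)), x)

# Split form, as suggested above: each op becomes its own graph node.
b = tf.conj(a)
c = tf.transpose(b)
d_split = tf.matmul(c, x)

# Alternative: let matmul apply the conjugate transpose itself.
d_adjoint = tf.matmul(a, x, adjoint_a=True)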

For SVD, the problem is that SVD does not have a GPU kernel yet. See here:

This means that the SVD has to be computed on the CPU, even when the tensor is instantiated on the GPU. Because of this, there is overhead from transferring the data from the GPU to the CPU for the computation, and then back to the GPU to store the result.
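A minimal sketch, assuming the TF 1.x session API from the question, that makes this fallback visible via device-placement logging:

import tensorflow as tf

with tf.device("/gpu:0"):
    m = tf.random_uniform((2048, 2048), dtype=tf.float32)
    s, u, v = tf.svd(m)

# allow_soft_placement lets TF move kernel-less ops to the CPU;
# log_device_placement prints the device each op actually ran on,
# so the Svd op shows up on /cpu:0 despite the /gpu:0 scope.
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run([s.op, u.op, v.op])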

For matmul on the GPU, the problem is in the last line of the benchmarking code: instead of calling func5 you are calling func4 again, so you are benchmarking TF CPU matmul a second time.
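The fix is to pass func5 instead:

def func5(): sess.run([V_gpu])
benchmark("TF GPU matmul", func5)  # was func4, which re-ran the CPU graph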

Apart from that, there are a few other things in the code you may want to check:

  • The init_holder_cpu and init_holder_gpu vars are not needed, since they are never used anywhere in the code (see the sketch below).
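For instance, a minimal sketch of the GPU block with the unused placeholder dropped; everything else is unchanged:

with tf.device("/gpu:0"):
    # init_holder_gpu removed: it was never fed or referenced
    specVarGPU = tf.random_uniform((N, N), dtype=dtype)
    S_gpu = tf.random_uniform((M, N), dtype=dtype)
    V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU), tf.transpose(S_gpu))
    [D2_gpu, E1_gpu, E2_gpu] = tf.svd(specVarGPU)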