
MatMul in TensorFlow is slower than dot product in numpy


I have observed that, on my machine, tf.matmul in TensorFlow runs significantly slower than the dot product in numpy. I have a GTX 1080 GPU and expect tf.matmul to be at least as fast as when running the code on the CPU (numpy).

Environment info

Operating system

lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 16.10
Release:    16.10
Codename:   yakkety
Installed versions of CUDA and cuDNN:

ls -l /usr/local/cuda-8.0/lib64/libcud*
-rw-r--r-- 1 root      root    556000 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudadevrt.a
lrwxrwxrwx 1 root      root        16 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so -> libcudart.so.8.0
lrwxrwxrwx 1 root      root        19 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0 -> libcudart.so.8.0.61
-rwxr-xr-x 1 root      root    415432 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0.61
-rw-r--r-- 1 root      root    775162 Feb 22  2017 /usr/local/cuda-8.0/lib64/libcudart_static.a
lrwxrwxrwx 1 voldemaro users       13 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so -> libcudnn.so.5
lrwxrwxrwx 1 voldemaro users       18 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5 -> libcudnn.so.5.1.10
-rwxr-xr-x 1 voldemaro users 84163560 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5.1.10
-rw-r--r-- 1 voldemaro users 70364814 Nov  6  2016 /usr/local/cuda-8.0/lib64/libcudnn_static.a
TensorFlow setup

python -c "import tensorflow; print(tensorflow.__version__)"
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
1.0.0
Code:

'''
Created on Sep 28, 2017

@author: voldemaro

Running on I7/GTX 1080

no MKL
('TF version: ', 'v1.0.0-rc2-15-g47bba63-dirty')
('TF url: ', 'https://github.com/tensorflow/tensorflow/commit/47bba63')
Timing in ms for 2048 x 2048 SVD of type <type 'numpy.float32'> and matmul for 16920 x 2048 of type <type 'numpy.float32'>
numpy default SVD    min:  3956.20, median:  4127.75, mean:  4264.41
TF CPU SVD           min:  5926.43, median:  5951.70, mean:  5961.43
TF GPU SVD           min:  5917.10, median:  6015.87, mean:  6039.63
numpy default .dot product min:  5816.97, median:  5933.43, mean:  5965.22
TF CPU matmul        min: 21939.19, median: 22485.99, mean: 22374.69
TF GPU matmul        min: 22026.52, median: 22109.97, mean: 22199.43
'''

from scipy import linalg;  # for svd
import numpy as np;
import os;
import sys;
import time;

os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"  # nospam

import tensorflow as tf;
import gc; gc.disable();

NUM_RUNS = 5;
dtype = np.float32;
N=2048;
M =  16920;


def get_tensorflow_version_url():
    import tensorflow as tf
    version=tf.__version__
    commit = tf.__git_version__
    # commit looks like this
    # 'v1.0.0-65-g4763edf-dirty'
    commit = commit.replace("'","")
    if commit.endswith('-dirty'):
        dirty = True
        commit = commit[:-len('-dirty')]
    commit=commit.rsplit('-g', 1)[1]
    url = 'https://github.com/tensorflow/tensorflow/commit/'+commit
    return url

def get_mkl_version():
    import ctypes
    import numpy as np
    ver = np.zeros(199, dtype=np.uint8)
    mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
    mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198)
    return ver[ver != 0].tostring()

timeline_counter = 0
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE);


def benchmark(message, func):
    time_list = []
    for i in range(NUM_RUNS):
        start_time = time.time();
        func();
        time_list.append(time.time()-start_time);

    time_list = 1000*np.array(time_list);  # get seconds, convert to ms
    if len(time_list)>0:
        min = np.min(time_list);
        median = np.median(time_list);
        formatted = ["%.2f"%(d,) for d in time_list[:10]];
        result = "min: %8.2f, median: %8.2f, mean: %8.2f"%(min, median, np.mean(time_list))
    else:
        result = "empty"
    print("%-20s %s"%(message, result))


if np.__config__.get_info("lapack_mkl_info"):
    print("MKL version", get_mkl_version())
else:
    print("no MKL")

print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())


svd_array = np.random.random_sample((N,N)).astype(dtype);
another_array = np.random.random_sample((M,N)).astype(dtype);

init_OP = tf.global_variables_initializer();


with tf.device("/gpu:0"):
    init_holder_gpu = tf.placeholder(dtype, shape=(M,M));

    specVarGPU = tf.random_uniform((N,N), dtype=dtype);
    S_gpu = tf.random_uniform((M,N), dtype=dtype);
    V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU, ), tf.transpose(S_gpu));
    [D2_gpu, E1_gpu,  E2_gpu] = tf.svd(specVarGPU);

with tf.device("/cpu:0"):
    init_holder_cpu = tf.placeholder(dtype, shape=(M,M));
    specVarCPU = tf.random_uniform((N,N), dtype=dtype);
    S_cpu = tf.random_uniform((M,N), dtype=dtype);
    V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU, ), tf.transpose(S_cpu));


    [D2_cpu, E1_cpu,  E2_cpu] = tf.svd(specVarCPU);
    V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), E1_cpu), tf.transpose(S_cpu));

print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype));

def func(): linalg.svd(svd_array)
benchmark("numpy default SVD", func)

config = tf.ConfigProto(allow_soft_placement = True, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)));
sess = tf.Session(config = config);
sess.run(init_OP);

def func2(): sess.run([D2_cpu.op, E1_cpu.op,  E2_cpu.op]);
benchmark("TF CPU SVD", func2);

def func3(): sess.run([D2_gpu.op, E1_gpu.op,  E2_gpu.op]);
benchmark("TF GPU SVD", func3);

def func1(): np.transpose(np.asmatrix(another_array)).getH().dot(svd_array).dot(np.transpose(another_array));
benchmark("numpy default .dot product", func1)

def func4(): sess.run([V_cpu]);
benchmark("TF CPU matmul", func4)

def func5(): sess.run([V_gpu])
benchmark("TF GPU matmul", func4)

Apparently TensorFlow does not optimize "nested" operations, so
tf.matmul(tf.transpose(tf.conj(a)), x) takes much longer than b = tf.conj(a), c = tf.transpose(b), and d = tf.matmul(c, x).

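For illustration, a minimal sketch against the TF 1.x API used in the question (the names a and x are placeholders; the adjoint_a flag is assumed to be available in this version's tf.matmul):

import tensorflow as tf

a = tf.random_uniform((2048, 2048), dtype=tf.float32)
x = tf.random_uniform((2048, 2048), dtype=tf.float32)

# Nested form from the question (slow):
d_nested = tf.matmul(tf.transpose(tf.conj(a)), x)

# Split form, as suggested above: each op becomes its own graph node.
b = tf.conj(a)
c = tf.transpose(b)
d_split = tf.matmul(c, x)

# Alternative: let matmul apply the conjugate transpose itself.
d_adjoint = tf.matmul(a, x, adjoint_a=True)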

For SVD, the problem is that SVD does not have a GPU kernel yet. See here:

This means that the SVD has to be computed on the CPU, even when the tensor is instantiated on the GPU. Because of this, there is overhead from transferring the data from the GPU to the CPU for the computation, and then back to the GPU to store the result.
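A minimal sketch, assuming the TF 1.x session API from the question, that makes this fallback visible via device-placement logging:

import tensorflow as tf

with tf.device("/gpu:0"):
    m = tf.random_uniform((2048, 2048), dtype=tf.float32)
    s, u, v = tf.svd(m)

# allow_soft_placement lets TF move kernel-less ops to the CPU;
# log_device_placement prints the device each op actually ran on,
# so the Svd op shows up on /cpu:0 despite the /gpu:0 scope.
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run([s.op, u.op, v.op])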

For matmul on the GPU, the problem is in the last line of the benchmarking code: instead of calling func5 you are calling func4 again, so you are benchmarking TF CPU matmul a second time.
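The fix is to pass func5 instead:

def func5(): sess.run([V_gpu])
benchmark("TF GPU matmul", func5)  # was func4, which re-ran the CPU graph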

Apart from that, there are a few other things in the code you may want to check:

  • The init_holder_cpu and init_holder_gpu vars are not needed, since they are never used anywhere in the code (see the sketch below).
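For instance, a minimal sketch of the GPU block with the unused placeholder dropped; everything else is unchanged:

with tf.device("/gpu:0"):
    # init_holder_gpu removed: it was never fed or referenced
    specVarGPU = tf.random_uniform((N, N), dtype=dtype)
    S_gpu = tf.random_uniform((M, N), dtype=dtype)
    V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU), tf.transpose(S_gpu))
    [D2_gpu, E1_gpu, E2_gpu] = tf.svd(specVarGPU)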