float4在CUDA中并不比float快
编辑:njuffa是正确的——此版本是使用-G编译的,该选项禁用了所有优化。开启加载和存储的矢量化后,新生成的SASS速度更快。 基于经典示例,我在CUDA中编写了向量加法的两个版本。问题是:尽管float4版本执行的事务数量只有float版本的四分之一,它的耗时却是float版本的两倍。对这两个内核的分析清楚地表明,float4版本平均每个事务执行4次加载和4次存储,而float版本每个事务仅执行1次加载和1次存储。这听起来像是一个关于float4访问未合并(non-coalesced)的新手问题——顺便说一句,下面的PTX也证实了这一点——但我找不到原因所在。 我使用的是CUDA 7.0 RC和Quadro K4000。你知道该从哪里查起吗?是否和编译选项或
__aligned__ keyword ?
// Element-wise vector addition, one scalar float per thread.
// NOTE(review): there is no bounds guard, so the launch configuration
// must cover the arrays exactly (the caller launches 4*N blocks of
// `local` threads over 4*size elements); add an `n` parameter and an
// `if (idx < n)` check if the grid could ever overrun the data.
__global__ void add_float(float *c, const float *a, const float *b)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    c[idx] = a[idx] + b[idx];
}
// Element-wise vector addition processing four floats per thread via a
// single float4 load and store per operand (the compiler can emit
// vectorized LD.E.128/ST.E.128 when optimizations are enabled).
// float4 accesses require 16-byte alignment; pointers from cudaMalloc
// satisfy this. NOTE(review): no bounds guard — the launch must cover
// the arrays exactly, as in add_float above.
__global__ void add_float4(float4 *c, const float4 *a, const float4 *b) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const float4 va = a[idx];
    const float4 vb = b[idx];
    c[idx] = make_float4(va.x + vb.x,
                         va.y + vb.y,
                         va.z + vb.z,
                         va.w + vb.w);
}
对应这一行代码的PTX:
float4 a1 = a[i];
输出如下。
SASS objdump输出如下:
/*0108*/ MOV R10, R0; /* 0x2800000000029de4 */
/*0110*/ ISET.LT.AND R11, R0, RZ, PT; /* 0x108e0000fc02dc23 */
/*0118*/ MOV32I R13, 0x4; /* 0x1800000010035de2 */
/*0120*/ ISETP.LE.U32.AND P0, PT, R13, 0x20, PT; /* 0x198ec00080d1dc03 */
/*0128*/ ISUB R12, 0x20, R13; /* 0x4800c00080d31e03 */
/*0130*/ SHL R11, R11, R13; /* 0x6000000034b2dc03 */
/*0138*/ SHR.U32 R14, R10, R12; /* 0x5800000030a39c03 */
/* 0x22c2804282328047 */
/*0148*/ IADD R11, R11, R14; /* 0x4800000038b2dc03 */
/*0150*/ @!P0 IADD R12, R13, -0x20; /* 0x4800ffff80d32003 */
/*0158*/ @!P0 SHL R11, R10, R12; /* 0x6000000030a2e003 */
/*0160*/ SHL R10, R10, R13; /* 0x6000000034a29c03 */
/*0168*/ MOV R10, R10; /* 0x2800000028029de4 */
/*0170*/ MOV R11, R11; /* 0x280000002c02dde4 */
/*0178*/ IADD R8.CC, R8, R10; /* 0x4801000028821c03 */
/* 0x228042c042828047 */
/*0188*/ IADD.X R9, R9, R11; /* 0x480000002c925c43 */
/*0190*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0198*/ MOV R9, R9; /* 0x2800000024025de4 */
/*01a0*/ LD.E R10, [R8]; /* 0x8400000000829c85 */
/*01a8*/ IADD R12.CC, R8, 0x4; /* 0x4801c00010831c03 */
/*01b0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01b8*/ MOV R12, R12; /* 0x2800000030031de4 */
/* 0x2202828042c2e287 */
/*01c8*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01d0*/ LD.E R11, [R12]; /* 0x8400000000c2dc85 */
/*01d8*/ IADD R12.CC, R8, 0x8; /* 0x4801c00020831c03 */
/*01e0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01e8*/ MOV R12, R12; /* 0x2800000030031de4 */
/*01f0*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01f8*/ LD.E R12, [R12]; /* 0x8400000000c31c85 */
/* 0x2282c202828042c7 */
/*0208*/ IADD R8.CC, R8, 0xc; /* 0x4801c00030821c03 */
/*0210*/ IADD.X R9, R9, RZ; /* 0x48000000fc925c43 */
/*0218*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0220*/ MOV R9, R9; /* 0x2800000024025de4 */
/*0228*/ LD.E R8, [R8]; /* 0x8400000000821c85 */
/*0230*/ IADD R14.CC, R2, 0xc; /* 0x4801c00030239c03 */
/*0238*/ IADD.X R15, R3, RZ; /* 0x48000000fc33dc43 */
/* 0x22828042c2e28047 */
/*0248*/ MOV R14, R14; /* 0x2800000038039de4 */
/*0250*/ MOV R15, R15; /* 0x280000003c03dde4 */
/*0258*/ ST.E [R14], R8; /* 0x9400000000e21c85 */
/*0260*/ IADD R8.CC, R2, 0x8; /* 0x4801c00020221c03 */
/*0268*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*0270*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0278*/ MOV R9, R9; /* 0x2800000024025de4 */
/* 0x22c2e2828042c2e7 */
/*0288*/ ST.E [R8], R12; /* 0x9400000000831c85 */
/*0290*/ IADD R8.CC, R2, 0x4; /* 0x4801c00010221c03 */
/*0298*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02a0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02a8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02b0*/ ST.E [R8], R11; /* 0x940000000082dc85 */
/*02b8*/ IADD R8.CC, R2, RZ; /* 0x48010000fc221c03 */
/* 0x22820042e2828047 */
/*02c8*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02d0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02d8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02e0*/ ST.E [R8], R10; /* 0x9400000000829c85 */
以下是剩下的:
// Benchmark harness comparing scalar (float) and vectorized (float4)
// element-wise vector addition. Two independent problem sets of equal
// total size are allocated: 4*size floats and size float4s, so both
// kernels process the same number of bytes.
// CUDA_CALL is the project-provided error-checking macro (defined
// elsewhere in this file).
// NOTE(review): the results copied back into host_c/host_c4 are never
// validated; compare them against a CPU reference if correctness matters.
void CudaTest()
{
    const int size = 8192;       // number of float4 elements
    const int n = 4 * size;      // equivalent number of scalar floats

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(n * sizeof(float));
    float *host_b = (float*)malloc(n * sizeof(float));
    float *host_c = (float*)malloc(n * sizeof(float));

    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));

    // Fill inputs with pseudo-random values in [0, 1].
    // BUG FIX: `rand() / RAND_MAX` is *integer* division and evaluates
    // to 0 for nearly every sample; divide by a float instead.
    for (int i = 0; i < n; i++)
    {
        host_a[i] = rand() / (float)RAND_MAX;
        host_b[i] = rand() / (float)RAND_MAX;
    }
    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / (float)RAND_MAX;
        host_a4[i].y = rand() / (float)RAND_MAX;
        host_a4[i].z = rand() / (float)RAND_MAX;
        host_a4[i].w = rand() / (float)RAND_MAX;
        host_b4[i].x = rand() / (float)RAND_MAX;
        host_b4[i].y = rand() / (float)RAND_MAX;
        host_b4[i].z = rand() / (float)RAND_MAX;
        host_b4[i].w = rand() / (float)RAND_MAX;
    }

    // Choose which GPU to run on; change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));

    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));

    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));

    const int local = 256;       // threads per block
    const int N = size / local;  // blocks for the float4 kernel

    // One thread per element: the scalar kernel needs 4x the blocks to
    // cover the same number of bytes.
    add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
    CUDA_CALL(cudaGetLastError());   // catch launch-configuration errors

    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    CUDA_CALL(cudaGetLastError());

    // Wait for both kernels to finish and surface any asynchronous
    // execution errors encountered during the launches.
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy output vectors from GPU buffers to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, n * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));

    // BUG FIX: release all buffers (the original leaked every allocation).
    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaFree(dev_b));
    CUDA_CALL(cudaFree(dev_c));
    CUDA_CALL(cudaFree(dev_a4));
    CUDA_CALL(cudaFree(dev_b4));
    CUDA_CALL(cudaFree(dev_c4));
    free(host_a);
    free(host_b);
    free(host_c);
    free(host_a4);
    free(host_b4);
    free(host_c4);
}
void CudaTest()
{
int size = 8192;
float *dev_a = 0;
float *dev_b = 0;
float *dev_c = 0;
float *host_a = (float*)malloc(4 * size * sizeof(float));
float *host_b = (float*)malloc(4 * size * sizeof(float));
float *host_c = (float*)malloc(4 * size * sizeof(float));
float4 *dev_a4 = 0;
float4 *dev_b4 = 0;
float4 *dev_c4 = 0;
float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
float4 *host_c4 = (float4*)malloc(size * sizeof(float4));
for (int i = 0; i < 4 * size; i++)
{
host_a[i] = rand() / RAND_MAX;
host_b[i] = rand() / RAND_MAX;
}
for (int i = 0; i < size; i++)
{
host_a4[i].x = rand() / RAND_MAX;
host_a4[i].y = rand() / RAND_MAX;
host_a4[i].z = rand() / RAND_MAX;
host_a4[i].w = rand() / RAND_MAX;
host_b4[i].x = rand() / RAND_MAX;
host_b4[i].y = rand() / RAND_MAX;
host_b4[i].z = rand() / RAND_MAX;
host_b4[i].w = rand() / RAND_MAX;
}
// 选择要运行的GPU,在多GPU系统上请修改此处。
CUDA_CALL(cudaSetDevice(0));
// 为三个向量(两个输入,一个输出)分配GPU缓冲区。
CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));
// 将输入向量从主机内存复制到GPU缓冲区。
CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));
int local = 256;
int N = size / local;
// 在GPU上启动内核,每个元素一个线程。
add_float <<<4*N, local >>>(dev_c, dev_a, dev_b);
// 检查启动内核时是否有任何错误
CUDA_CALL(cudaGetLastError());
add_float4 <<<N, local >>>(dev_c4, dev_a4, dev_b4);
// 检查启动内核时是否有任何错误
CUDA_CALL(cudaGetLastError());
// cudaDeviceSynchronize等待内核完成,然后返回
// 在启动过程中遇到的任何错误。
CUDA_CALL(cudaDeviceSynchronize());
// 将输出向量从GPU缓冲区复制到主机内存。
CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}
使用GPU硬件提供的矢量加载/存储指令被认为是编译器应用的性能优化,因为代码使用标量加载和存储是完全功能的。当代码由nvcc
和-G
编译时(通常用于调试),所有优化(包括加载和存储的矢量化)都将关闭
要检查加载/存储矢量化,重要的是查看正在执行的实际机器代码(SASS),而不是PTX,PTX只是一个中间代码,由名为
ptxas
的优化编译器组件编译成SASS,该组件由驱动程序nvcc
调用。在nvcc生成的可执行文件上运行cuobjdump --dump-sass
,检查机器代码。您使用什么目标体系结构和CUDA版本来生成所显示的PTX?在CUDA 6.5上,我得到了预期的矢量化加载和存储指令。PTX只是一种中间格式。要检查矢量化加载和存储是否存在,请检查使用cuobjdump --dump-sass
提取的机器代码(SASS)。我已经为Njuffa添加了SASS objdump。泰龙,我不知道你在说什么。我正在使用cuda 7.0为K4000 quadro进行构建。nvcc的标志是compute_30,sm_30。上面的SASS代码表明您正在使用-G
标志进行编译,这会导致禁用所有优化(包括矢量化)。感谢Njuffa指出这一点。现在它的工作原理与预期一致。
// Benchmark harness comparing scalar (float) and vectorized (float4)
// element-wise vector addition. Two independent problem sets of equal
// total size are allocated: 4*size floats and size float4s, so both
// kernels process the same number of bytes.
// CUDA_CALL is the project-provided error-checking macro (defined
// elsewhere in this file).
// NOTE(review): the results copied back into host_c/host_c4 are never
// validated; compare them against a CPU reference if correctness matters.
void CudaTest()
{
    const int size = 8192;       // number of float4 elements
    const int n = 4 * size;      // equivalent number of scalar floats

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(n * sizeof(float));
    float *host_b = (float*)malloc(n * sizeof(float));
    float *host_c = (float*)malloc(n * sizeof(float));

    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));

    // Fill inputs with pseudo-random values in [0, 1].
    // BUG FIX: `rand() / RAND_MAX` is *integer* division and evaluates
    // to 0 for nearly every sample; divide by a float instead.
    for (int i = 0; i < n; i++)
    {
        host_a[i] = rand() / (float)RAND_MAX;
        host_b[i] = rand() / (float)RAND_MAX;
    }
    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / (float)RAND_MAX;
        host_a4[i].y = rand() / (float)RAND_MAX;
        host_a4[i].z = rand() / (float)RAND_MAX;
        host_a4[i].w = rand() / (float)RAND_MAX;
        host_b4[i].x = rand() / (float)RAND_MAX;
        host_b4[i].y = rand() / (float)RAND_MAX;
        host_b4[i].z = rand() / (float)RAND_MAX;
        host_b4[i].w = rand() / (float)RAND_MAX;
    }

    // Choose which GPU to run on; change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));

    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, n * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));

    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));

    const int local = 256;       // threads per block
    const int N = size / local;  // blocks for the float4 kernel

    // One thread per element: the scalar kernel needs 4x the blocks to
    // cover the same number of bytes.
    add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
    CUDA_CALL(cudaGetLastError());   // catch launch-configuration errors

    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    CUDA_CALL(cudaGetLastError());

    // Wait for both kernels to finish and surface any asynchronous
    // execution errors encountered during the launches.
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy output vectors from GPU buffers to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, n * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));

    // BUG FIX: release all buffers (the original leaked every allocation).
    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaFree(dev_b));
    CUDA_CALL(cudaFree(dev_c));
    CUDA_CALL(cudaFree(dev_a4));
    CUDA_CALL(cudaFree(dev_b4));
    CUDA_CALL(cudaFree(dev_c4));
    free(host_a);
    free(host_b);
    free(host_c);
    free(host_a4);
    free(host_b4);
    free(host_c4);
}