Cuda将双精度降级为浮点错误,尽管代码中没有双精度
我正在用PyCUDA编写一个内核。我的GPU设备只支持compute capability 1.1(arch sm_11),因此我只能在代码中使用浮点。我已经尽了很大的努力来确保我所做的每一件事都是使用浮点数,但尽管如此,我的代码中有一行代码一直在导致编译器错误 代码块是:Cuda将双精度降级为浮点错误,尽管代码中没有双精度,cuda,gpu,pycuda,Cuda,Gpu,Pycuda,我正在用PyCUDA编写一个内核。我的GPU设备只支持compute capability 1.1(arch sm_11),因此我只能在代码中使用浮点。我已经尽了很大的努力来确保我所做的每一件事都是使用浮点数,但尽管如此,我的代码中有一行代码一直在导致编译器错误 代码块是: // Gradient magnitude, so 1 <= x <= width, 1 <= y <= height. if( j > 0 && j < im_
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
所有这些变化都会产生相同的错误:
pycuda.driver.CompileError: nvcc said it demoted types in source code it compiled--this is likely not what you want.
[command: nvcc --cubin -arch sm_11 -I/usr/local/lib/python2.7/dist-packages/pycuda-2011.1.2-py2.7-linux-x86_64.egg/pycuda/../include/pycuda kernel.cu]
[stderr:
ptxas /tmp/tmpxft_00004329_00000000-2_kernel.ptx, line 128; warning : Double is not supported. Demoting to float
]
有什么想法吗?我已经调试了我代码中的许多错误,并希望它今晚能正常工作,但事实证明这是一个我无法理解的错误
添加了——这里是内核的一个截断版本,它在我的机器上产生了上述相同的错误
every_pixel_hog_kernel_source = \
"""
#include <math.h>
#include <stdio.h>
__device__ int idx(int ii, int jj){
return gridDim.x*blockDim.x*ii+jj;
}
__device__ int bin_number(float angle_val, int total_angles, int num_bins){
float angle1;
float min_dist;
float this_dist;
int bin_indx;
angle1 = 0.0;
min_dist = abs(angle_val - angle1);
bin_indx = 0;
for(int kk=1; kk < num_bins; kk++){
angle1 = angle1 + float(total_angles)/float(num_bins);
this_dist = abs(angle_val - angle1);
if(this_dist < min_dist){
min_dist = this_dist;
bin_indx = kk;
}
}
return bin_indx;
}
__device__ int hist_number(int ii, int jj){
int hist_num = 0;
if(jj >= 0 && jj < 11){
if(ii >= 0 && ii < 11){
hist_num = 0;
}
else if(ii >= 11 && ii < 22){
hist_num = 3;
}
else if(ii >= 22 && ii < 33){
hist_num = 6;
}
}
else if(jj >= 11 && jj < 22){
if(ii >= 0 && ii < 11){
hist_num = 1;
}
else if(ii >= 11 && ii < 22){
hist_num = 4;
}
else if(ii >= 22 && ii < 33){
hist_num = 7;
}
}
else if(jj >= 22 && jj < 33){
if(ii >= 0 && ii < 11){
hist_num = 2;
}
else if(ii >= 11 && ii < 22){
hist_num = 5;
}
else if(ii >= 22 && ii < 33){
hist_num = 8;
}
}
return hist_num;
}
__global__ void every_pixel_hog_kernel(float* input_image, int im_width, int im_height, float* gaussian_array, float* x_gradient, float* y_gradient, float* gradient_mag, float* angles, float* output_array)
{
/////
// Setup the thread indices and linear offset.
/////
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int ang_limit = 180;
int ang_bins = 9;
float pi_val = 3.141592653589f; //91
/////
// Compute a Gaussian smoothing of the current pixel and save it into a new image array
// Use sync threads to make sure everyone does the Gaussian smoothing before moving on.
/////
if( j > 1 && i > 1 && j < im_width-2 && i < im_height-2 ){
// Hard-coded unit standard deviation 5-by-5 Gaussian smoothing filter.
gaussian_array[idx(i,j)] = float(1.0/273.0) *(
input_image[idx(i-2,j-2)] + float(4.0)*input_image[idx(i-2,j-1)] + float(7.0)*input_image[idx(i-2,j)] + float(4.0)*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] +
float(4.0)*input_image[idx(i-1,j-2)] + float(16.0)*input_image[idx(i-1,j-1)] + float(26.0)*input_image[idx(i-1,j)] + float(16.0)*input_image[idx(i-1,j+1)] + float(4.0)*input_image[idx(i-1,j+2)] +
float(7.0)*input_image[idx(i,j-2)] + float(26.0)*input_image[idx(i,j-1)] + float(41.0)*input_image[idx(i,j)] + float(26.0)*input_image[idx(i,j+1)] + float(7.0)*input_image[idx(i,j+2)] +
float(4.0)*input_image[idx(i+1,j-2)] + float(16.0)*input_image[idx(i+1,j-1)] + float(26.0)*input_image[idx(i+1,j)] + float(16.0)*input_image[idx(i+1,j+1)] + float(4.0)*input_image[idx(i+1,j+2)] +
input_image[idx(i+2,j-2)] + float(4.0)*input_image[idx(i+2,j-1)] + float(7.0)*input_image[idx(i+2,j)] + float(4.0)*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);
}
__syncthreads();
/////
// Compute the simple x and y gradients of the image and store these into new images
// again using syncthreads before moving on.
/////
// X-gradient, ensure x is between 1 and width-1
if( j > 0 && j < im_width){
x_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i,j-1)]);
}
else if(j == 0){
x_gradient[idx(i,j)] = float(0.0);
}
// Y-gradient, ensure y is between 1 and height-1
if( i > 0 && i < im_height){
y_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i-1,j)]);
}
else if(i == 0){
y_gradient[idx(i,j)] = float(0.0);
}
__syncthreads();
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j < im_width && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
__syncthreads();
/////
// Compute the orientation angles
/////
if( j < im_width && i < im_height){
if(ang_limit == 360){
angles[idx(i,j)] = float((atan2(y_gradient[idx(i,j)],x_gradient[idx(i,j)])+pi_val)*float(180.0)/pi_val);
}
else{
angles[idx(i,j)] = float((atan( y_gradient[idx(i,j)]/x_gradient[idx(i,j)] )+(pi_val/float(2.0)))*float(180.0)/pi_val);
}
}
__syncthreads();
// Compute the HoG using the above arrays. Do so in a 3x3 grid, with 9 angle bins for each grid.
// forming an 81-vector and then write this 81 vector as a row in the large output array.
int top_bound, bot_bound, left_bound, right_bound, offset;
int window = 32;
if(i-window/2 > 0){
top_bound = i-window/2;
bot_bound = top_bound + window;
}
else{
top_bound = 0;
bot_bound = top_bound + window;
}
if(j-window/2 > 0){
left_bound = j-window/2;
right_bound = left_bound + window;
}
else{
left_bound = 0;
right_bound = left_bound + window;
}
if(bot_bound - im_height > 0){
offset = bot_bound - im_height;
top_bound = top_bound - offset;
bot_bound = bot_bound - offset;
}
if(right_bound - im_width > 0){
offset = right_bound - im_width;
right_bound = right_bound - offset;
left_bound = left_bound - offset;
}
int counter_i = 0;
int counter_j = 0;
int bin_indx, hist_indx, glob_col_indx, glob_row_indx;
int row_width = 81;
for(int pix_i = top_bound; pix_i < bot_bound; pix_i++){
for(int pix_j = left_bound; pix_j < right_bound; pix_j++){
bin_indx = bin_number(angles[idx(pix_i,pix_j)], ang_limit, ang_bins);
hist_indx = hist_number(counter_i,counter_j);
glob_col_indx = ang_bins*hist_indx + bin_indx;
glob_row_indx = idx(i,j);
output_array[glob_row_indx*row_width + glob_col_indx] = float(output_array[glob_row_indx*row_width + glob_col_indx] + float(gradient_mag[idx(pix_i,pix_j)]));
counter_j = counter_j + 1;
}
counter_i = counter_i + 1;
counter_j = 0;
}
}
"""
every_pixel_hog_kernel_source=\
"""
#包括
#包括
__设备_uu_uu_uu_uuu_uu_uu_u_uu_u_uu_uu_uu;int idx(int ii,int jj){
返回gridDim.x*blockDim.x*ii+jj;
}
__设备编号(浮动角度值、总角度值、数值){
浮动角度1;
浮动最小距离;
浮动此区域;
int bin_indx;
角度1=0.0;
最小距离=绝对值(角度值-角度1);
bin_indx=0;
对于(int kk=1;kk=0&&jj<11){
如果(ii>=0&&ii<11){
hist_num=0;
}
否则,如果(ii>=11&&ii<22){
hist_num=3;
}
否则,如果(ii>=22&&ii<33){
hist_num=6;
}
}
否则如果(jj>=11&&jj<22){
如果(ii>=0&&ii<11){
hist_num=1;
}
否则,如果(ii>=11&&ii<22){
hist_num=4;
}
否则,如果(ii>=22&&ii<33){
历史数字=7;
}
}
否则如果(jj>=22&&jj<33){
如果(ii>=0&&ii<11){
hist_num=2;
}
否则,如果(ii>=11&&ii<22){
hist_num=5;
}
否则,如果(ii>=22&&ii<33){
hist_num=8;
}
}
返回hist_num;
}
__全局\uuuuuu无效每个\u像素\uuHog\u内核(浮点*输入\u图像、整数im\u宽度、整数im\u高度、浮点*高斯\u数组、浮点*x\u渐变、浮点*y\u渐变、浮点*渐变、浮点*mag、浮点*角度、浮点*输出\u数组)
{
/////
//设置螺纹索引和线性偏移。
/////
int i=blockDim.y*blockIdx.y+threadIdx.y;
int j=blockDim.x*blockIdx.x+threadIdx.x;
int ang_极限=180;
int ang_bins=9;
浮点pi_val=3.141592653589f;//91
/////
//计算当前像素的高斯平滑并将其保存到新的图像数组中
//使用同步线程确保每个人在继续之前都进行高斯平滑。
/////
如果(j>1&&i>1&&j0&&j0&&i
(注释,不是答案,但太大,无法作为注释)
你能提供l周围的PTX代码吗
every_pixel_hog_kernel_source = \
"""
#include <math.h>
#include <stdio.h>
__device__ int idx(int ii, int jj){
return gridDim.x*blockDim.x*ii+jj;
}
__device__ int bin_number(float angle_val, int total_angles, int num_bins){
float angle1;
float min_dist;
float this_dist;
int bin_indx;
angle1 = 0.0;
min_dist = abs(angle_val - angle1);
bin_indx = 0;
for(int kk=1; kk < num_bins; kk++){
angle1 = angle1 + float(total_angles)/float(num_bins);
this_dist = abs(angle_val - angle1);
if(this_dist < min_dist){
min_dist = this_dist;
bin_indx = kk;
}
}
return bin_indx;
}
__device__ int hist_number(int ii, int jj){
int hist_num = 0;
if(jj >= 0 && jj < 11){
if(ii >= 0 && ii < 11){
hist_num = 0;
}
else if(ii >= 11 && ii < 22){
hist_num = 3;
}
else if(ii >= 22 && ii < 33){
hist_num = 6;
}
}
else if(jj >= 11 && jj < 22){
if(ii >= 0 && ii < 11){
hist_num = 1;
}
else if(ii >= 11 && ii < 22){
hist_num = 4;
}
else if(ii >= 22 && ii < 33){
hist_num = 7;
}
}
else if(jj >= 22 && jj < 33){
if(ii >= 0 && ii < 11){
hist_num = 2;
}
else if(ii >= 11 && ii < 22){
hist_num = 5;
}
else if(ii >= 22 && ii < 33){
hist_num = 8;
}
}
return hist_num;
}
__global__ void every_pixel_hog_kernel(float* input_image, int im_width, int im_height, float* gaussian_array, float* x_gradient, float* y_gradient, float* gradient_mag, float* angles, float* output_array)
{
/////
// Setup the thread indices and linear offset.
/////
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int ang_limit = 180;
int ang_bins = 9;
float pi_val = 3.141592653589f; //91
/////
// Compute a Gaussian smoothing of the current pixel and save it into a new image array
// Use sync threads to make sure everyone does the Gaussian smoothing before moving on.
/////
if( j > 1 && i > 1 && j < im_width-2 && i < im_height-2 ){
// Hard-coded unit standard deviation 5-by-5 Gaussian smoothing filter.
gaussian_array[idx(i,j)] = float(1.0/273.0) *(
input_image[idx(i-2,j-2)] + float(4.0)*input_image[idx(i-2,j-1)] + float(7.0)*input_image[idx(i-2,j)] + float(4.0)*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] +
float(4.0)*input_image[idx(i-1,j-2)] + float(16.0)*input_image[idx(i-1,j-1)] + float(26.0)*input_image[idx(i-1,j)] + float(16.0)*input_image[idx(i-1,j+1)] + float(4.0)*input_image[idx(i-1,j+2)] +
float(7.0)*input_image[idx(i,j-2)] + float(26.0)*input_image[idx(i,j-1)] + float(41.0)*input_image[idx(i,j)] + float(26.0)*input_image[idx(i,j+1)] + float(7.0)*input_image[idx(i,j+2)] +
float(4.0)*input_image[idx(i+1,j-2)] + float(16.0)*input_image[idx(i+1,j-1)] + float(26.0)*input_image[idx(i+1,j)] + float(16.0)*input_image[idx(i+1,j+1)] + float(4.0)*input_image[idx(i+1,j+2)] +
input_image[idx(i+2,j-2)] + float(4.0)*input_image[idx(i+2,j-1)] + float(7.0)*input_image[idx(i+2,j)] + float(4.0)*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);
}
__syncthreads();
/////
// Compute the simple x and y gradients of the image and store these into new images
// again using syncthreads before moving on.
/////
// X-gradient, ensure x is between 1 and width-1
if( j > 0 && j < im_width){
x_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i,j-1)]);
}
else if(j == 0){
x_gradient[idx(i,j)] = float(0.0);
}
// Y-gradient, ensure y is between 1 and height-1
if( i > 0 && i < im_height){
y_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i-1,j)]);
}
else if(i == 0){
y_gradient[idx(i,j)] = float(0.0);
}
__syncthreads();
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j < im_width && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
__syncthreads();
/////
// Compute the orientation angles
/////
if( j < im_width && i < im_height){
if(ang_limit == 360){
angles[idx(i,j)] = float((atan2(y_gradient[idx(i,j)],x_gradient[idx(i,j)])+pi_val)*float(180.0)/pi_val);
}
else{
angles[idx(i,j)] = float((atan( y_gradient[idx(i,j)]/x_gradient[idx(i,j)] )+(pi_val/float(2.0)))*float(180.0)/pi_val);
}
}
__syncthreads();
// Compute the HoG using the above arrays. Do so in a 3x3 grid, with 9 angle bins for each grid.
// forming an 81-vector and then write this 81 vector as a row in the large output array.
int top_bound, bot_bound, left_bound, right_bound, offset;
int window = 32;
if(i-window/2 > 0){
top_bound = i-window/2;
bot_bound = top_bound + window;
}
else{
top_bound = 0;
bot_bound = top_bound + window;
}
if(j-window/2 > 0){
left_bound = j-window/2;
right_bound = left_bound + window;
}
else{
left_bound = 0;
right_bound = left_bound + window;
}
if(bot_bound - im_height > 0){
offset = bot_bound - im_height;
top_bound = top_bound - offset;
bot_bound = bot_bound - offset;
}
if(right_bound - im_width > 0){
offset = right_bound - im_width;
right_bound = right_bound - offset;
left_bound = left_bound - offset;
}
int counter_i = 0;
int counter_j = 0;
int bin_indx, hist_indx, glob_col_indx, glob_row_indx;
int row_width = 81;
for(int pix_i = top_bound; pix_i < bot_bound; pix_i++){
for(int pix_j = left_bound; pix_j < right_bound; pix_j++){
bin_indx = bin_number(angles[idx(pix_i,pix_j)], ang_limit, ang_bins);
hist_indx = hist_number(counter_i,counter_j);
glob_col_indx = ang_bins*hist_indx + bin_indx;
glob_row_indx = idx(i,j);
output_array[glob_row_indx*row_width + glob_col_indx] = float(output_array[glob_row_indx*row_width + glob_col_indx] + float(gradient_mag[idx(pix_i,pix_j)]));
counter_j = counter_j + 1;
}
counter_i = counter_i + 1;
counter_j = 0;
}
}
"""
__constant__ int im_width;
__constant__ int im_height;
__device__ int idx(int i,int j) {
return i+j*im_width;
}
__global__ void kernel(float* gradient_mag, float* x_gradient, float* y_gradient) {
int i = threadIdx.x;
int j = threadIdx.y;
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
}
C:\>nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2011 NVIDIA Corporation
Built on Wed_Oct_19_23:13:02_PDT_2011
Cuda compilation tools, release 4.1, V0.2.1221
angle1 = 0.0;
gaussian_array[idx(i,j)] = float(1.0/273.0) *
import pycuda.driver as cuda
from pycuda.compiler import compile
x=compile("""put your truncated kernel code here""",options=[],arch="sm_11",keep=True)