cuda-无法访问blockDim.x?
我正在写一个处理 2D 图像的 CUDA 程序。问题是:当我尝试访问 blockDim.x 和 blockIdx.x 时,内核总是无法启动,并输出 unknown error。此外,如果我使用 3x5 的图像,可以访问 threadIdx.x,而使用 2048x2048 的图像则不行。我的内核代码在 PyCUDA 下运行正常,但现在我必须改用 CUDA C。我认为问题可能与我传递数组指针的方式有关,或者是 cudaMalloc 的用法有问题。块大小和网格大小的配置与 PyCUDA 中相同,而同样的配置在 PyCUDA 中运行良好,因此我不知道如何更正。我用 cuda-memcheck 运行,得到 unknown error 30;我在谷歌上搜索过解决方案,但没有找到有用的信息。(标签:cuda)
// Debug kernel: every launched thread stores its threadIdx.x into debug[] at
// its flat global index (blockIdx.x * blockDim.x + threadIdx.x).
// There is no bounds guard, so debug must hold at least
// gridDim.x * blockDim.x ints; otherwise the surplus threads write past the
// end of the allocation. `in` and `out` are accepted but not touched.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    debug[gid] = threadIdx.x;  // debug output only
}
// Host-side excerpt: allocates the device buffers, uploads the image and
// launches extractor with one thread per pixel. The declarations of w, h,
// img_data, num_sample_per_point and BLOCK_SIZE are elided ("// ...").
int main(int arg, char* args[])
{
// ...
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
// (return codes of the CUDA calls below are not checked)
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
// debug holds exactly `size` ints, one per pixel
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
// w*h/BLOCK_SIZE + 1 always adds one block, even when w*h is an exact
// multiple of BLOCK_SIZE, so the grid can contain more threads than `size`.
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
// clean up code and processing result
}
编辑
完整代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include "PNG.h"
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
// Sampling kernel, one thread per pixel (1D launch; flat index = pixel index).
//   in         : source image, width*height bytes, row-major
//   out        : num_sample bytes per pixel; slot k of pixel p is out[p*num_sample + k]
//   debug      : one int per pixel, receives the writing thread's threadIdx.x
//   disX, disY : num_sample x/y offsets, dereferenced on the device
//   pad        : width of the border band in which samples are range-checked
// For pixels inside the border band, samples that fall outside the image are
// skipped and their output slot is left unwritten. Interior pixels are sampled
// without any range check.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
    const int pixel = blockIdx.x * blockDim.x + threadIdx.x;
    if (pixel >= width * height) {
        return;  // surplus thread of the last block: no pixel to process
    }
    debug[pixel] = threadIdx.x;

    const int row = pixel / width;
    const int col = pixel % width;
    const bool nearEdge = (col < pad) || (col >= (width - pad)) ||
                          (row < pad) || (row >= (height - pad));

    for (int k = 0; k < num_sample; ++k) {
        const int sx = col + disX[k];
        const int sy = row + disY[k];
        const bool inside = (sx >= 0) && (sx <= (width - 1)) &&
                            (sy >= 0) && (sy <= (height - 1));
        if (nearEdge && !inside) {
            continue;  // border pixel whose sample leaves the image
        }
        out[(pixel * num_sample) + k] = in[sy * width + sx];  // copy sampled value to result
    }
}
// Builds the list of sampling radii: starts at 0 and keeps appending while
// r <= L/2, stepping by INC1 while r < R_IN and by INC2 afterwards.
// Prints the list on one line (each value followed by a space) and returns it.
vector<int> getCirclePos() {
    vector<int> radii;
    for (int r = 0; r <= (L/2); r += (r < R_IN) ? INC1 : INC2) {
        radii.push_back(r);
    }
    cout << "circlePos:" << endl;
    for (const int radius : radii) {
        cout << radius << ' ';
    }
    cout << endl;
    return radii;
}
// Host driver of the question's full program: builds the sampling offsets,
// loads test.png, uploads one byte per pixel, launches extractor and dumps
// the out/debug buffers to stdout.
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
// N_P offsets per radius; both arrays live in host memory (new[]).
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
// angle in degrees; the double results are converted to int on assignment.
// Note: M_PI is used here although the file defines its own PI macro.
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
PNG inPng("test.png");
// PNG outPng;
// outPng.Create(inPng.w, inPng.h);
//store width and height so we can use them for our output image later
const unsigned int w = inPng.w;
const unsigned int h = inPng.h;
cout << "w: " << w << " h: " << h << endl;
// size is the pixel count; inPng.data is read with a stride of 4 below
// (presumably 4 channels R, G, B, A per pixel - confirm against PNG.h)
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
// (return codes of cudaMalloc/cudaMemcpy are not checked in this function)
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Keep only every 4th byte of the PNG data: one byte per pixel.
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(inPng.data[i*4]);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
//free the input image because we do not need it anymore
inPng.Free();
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
// Always one extra block, even when w*h is a multiple of BLOCK_SIZE.
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
// NOTE: disX and disY were allocated with new[] above (host memory) and are
// passed directly as kernel arguments; nothing here copies them to the device.
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
// Only the status returned by cudaGetLastError() right after the launch is inspected.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
// Host copies of the results.
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
// disX and disY are not released here.
delete[] tmp; delete[] tmp_debug;
return 0;
}
根据您的评论,这是为每个块定义1024个线程:
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
根据您的问题文本,在失败的情况下,w和h分别为2048,因此:
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
这会创建 4097 个块,正如您在注释中指出的那样。
4097 个块、每块 1024 个线程,总共 4195328 个线程;但您分配的空间只有 2048*2048 = 4194304 个元素。因此,启动的 4195328 个线程中只有 4194304 个有对应的元素,多出 1024 个线程。
那么,这 1024 个多余的线程会做什么呢?它们仍然运行内核代码,并试图访问 debug 数组已分配空间之外的位置。
这导致C和C++中的未定义行为。
解决此问题的常规方法是将问题规模传递给内核,并在内核代码中添加线程检查,如下所示:
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int n)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
if (idx < n)
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
在完整的代码中,内核存在非法内存访问问题。我对代码做了修改,去掉了对 PNG 的依赖;如果内核中只保留设置 debug 的代码、省略其余部分,它可以正常运行。但是,如果包含您的其余内核代码,并用 cuda-memcheck 运行,就会得到各种越界访问错误。今后您可以用下面演示的方法调试此类问题:
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
// Sampling kernel, one thread per pixel (flat 1D index).
// Without -DFAIL only debug[idx] is written; with -DFAIL the sampling code
// that reads disX/disY/in and writes out is compiled in as well.
// disX and disY are dereferenced in device code, so they must be addresses
// the device can read.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
// Thread check: threads whose index is past the last pixel do nothing.
if (idx < check) {
debug[idx] = threadIdx.x;
// Row (y) and column (x) of this thread's pixel.
y = idx/width;
x = idx%width;
#ifdef FAIL
// Pixels within `pad` of any image edge take the range-checked path.
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
// Copy only when the sampled position lies inside the image; otherwise
// the output slot is left unwritten.
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
// Interior pixels: samples are copied without a range check.
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
// Builds the list of sampling radii: starts at 0 and keeps appending while
// r <= L/2, stepping by INC1 while r < R_IN and by INC2 afterwards.
// Per-value printing is disabled in this version: only the "circlePos:"
// header and an empty line are written.
vector<int> getCirclePos() {
    vector<int> radii;
    for (int r = 0; r <= (L/2); r += (r < R_IN) ? INC1 : INC2) {
        radii.push_back(r);
    }
    cout << "circlePos:" << endl;
    cout << endl;
    return radii;
}
// Host driver of the PNG-free test case: builds the sampling offsets, uploads
// an all-zero 2048x2048 image, launches extractor and copies the results
// back. The per-element dumps are commented out; only the headers print.
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
// N_P offsets per radius; both arrays live in host memory (new[]).
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
// angle in degrees; the double results are converted to int on assignment.
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
// size is the pixel count (w * h), one byte per pixel
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
// (return codes of cudaMalloc/cudaMemcpy are not checked in this function)
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Synthetic all-zero image replaces the PNG input.
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
// Always one extra block, even when w*h is a multiple of BLOCK_SIZE.
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
// NOTE: disX and disY were allocated with new[] above (host memory) and are
// passed directly as kernel arguments; nothing here copies them to the device.
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
// Only the status returned by cudaGetLastError() right after the launch is inspected.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
// Host copies of the results.
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
// disX and disY are not released here.
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
...
========= Invalid __global__ read of size 4
========= at 0x00000418 in /home/ubuntu/bobc/misc/t146.cu:41:extractor(unsigned char const *, unsigned char*, int*, int*, int*, int, int, int, int)
========= by thread (197,0,0) in block (17,0,0)
========= Address 0x00c8b290 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5)
...
(and much more output like this)
disX 是用 new 在主机上分配的指针(int* disX = new int[num_sample_per_point];),但您正试图将其传递到设备代码:
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
^^^^
嗨,谢谢你的回复。我已经按您说的修改了代码,但似乎即使做了线程检查,仍然无法在内核中访问 debug 变量。请看我的编辑。
你需要提供一个完整的最小示例。我不知道 width 和 height 是什么。这应该是一段完整的代码,我可以直接复制、粘贴并编译,而无需添加或更改任何内容。
提前谢谢。我没法编译,我没有 PNG.h。不管怎样,你应该提供一段最小的代码。
你把 disX 和 disY 这两个主机指针传递给了设备代码,这是 CUDA 中的一个基本错误。
$ cat t147.cu
const int width = 2048;
const int height = 2048;
const int BLOCK_SIZE = 1024;
// Minimal repro kernel: each thread whose flat index is below width*height
// (file-level constants) writes 1 into debug[] at that index. Threads past
// the end do nothing. `in` and `out` are accepted but not touched.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= width * height) {
        return;  // surplus thread of the last block
    }
    debug[gid] = 1;  // the write the question reported as failing with "unknown error"
}
// Minimal host driver: allocates the three device buffers, launches
// extractor with one thread per pixel and waits for it to finish.
// Returns 0 on success and 1 if any CUDA call or the kernel fails.
int main(int arg, char* args[])
{
    const int w = width;
    const int h = height;
    const int num_sample_per_point = 1;
    const int size = w*h;  // number of pixels

    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;
    int status = 0;

    // Allocate GPU buffers; stop at the first failure instead of launching
    // a kernel on pointers that were never allocated.
    if (cudaMalloc((void**)&in, size * sizeof(unsigned char)) != cudaSuccess ||
        cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char)) != cudaSuccess ||
        cudaMalloc((void**)&debug, size * sizeof(int)) != cudaSuccess) {
        status = 1;
    }

    if (status == 0) {
        dim3 b_dim(BLOCK_SIZE, 1, 1);                            // (1024, 1, 1)
        // Ceiling division: just enough blocks to cover every pixel.
        dim3 g_dim((size + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);  // (4096, 1, 1)
        extractor<<<g_dim, b_dim>>>(in, out, debug);
        // cudaGetLastError reports launch-configuration errors; the
        // synchronize call reports errors raised while the kernel ran.
        if (cudaGetLastError() != cudaSuccess ||
            cudaDeviceSynchronize() != cudaSuccess) {
            status = 1;
        }
    }

    // Release the device buffers (cudaFree on a null pointer does nothing).
    cudaFree(in);
    cudaFree(out);
    cudaFree(debug);
    return status;
}
$ nvcc -arch=sm_61 -o t147 t147.cu
$ cuda-memcheck ./t147
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
// Sampling kernel, one thread per pixel (flat 1D index).
// Without -DFAIL only debug[idx] is written; with -DFAIL the sampling code
// that reads disX/disY/in and writes out is compiled in as well.
// disX and disY are dereferenced in device code, so they must be addresses
// the device can read.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
// Thread check: threads whose index is past the last pixel do nothing.
if (idx < check) {
debug[idx] = threadIdx.x;
// Row (y) and column (x) of this thread's pixel.
y = idx/width;
x = idx%width;
#ifdef FAIL
// Pixels within `pad` of any image edge take the range-checked path.
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
// Copy only when the sampled position lies inside the image; otherwise
// the output slot is left unwritten.
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
// Interior pixels: samples are copied without a range check.
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
// Builds the list of sampling radii: starts at 0 and keeps appending while
// r <= L/2, stepping by INC1 while r < R_IN and by INC2 afterwards.
// Per-value printing is disabled in this version: only the "circlePos:"
// header and an empty line are written.
vector<int> getCirclePos() {
    vector<int> radii;
    for (int r = 0; r <= (L/2); r += (r < R_IN) ? INC1 : INC2) {
        radii.push_back(r);
    }
    cout << "circlePos:" << endl;
    cout << endl;
    return radii;
}
// Host driver of the PNG-free test case: builds the sampling offsets, uploads
// an all-zero 2048x2048 image, launches extractor and copies the results
// back. The per-element dumps are commented out; only the headers print.
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
// N_P offsets per radius; both arrays live in host memory (new[]).
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
// angle in degrees; the double results are converted to int on assignment.
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
// size is the pixel count (w * h), one byte per pixel
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
// (return codes of cudaMalloc/cudaMemcpy are not checked in this function)
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Synthetic all-zero image replaces the PNG input.
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
// Always one extra block, even when w*h is a multiple of BLOCK_SIZE.
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
// NOTE: disX and disY were allocated with new[] above (host memory) and are
// passed directly as kernel arguments; nothing here copies them to the device.
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
// Only the status returned by cudaGetLastError() right after the launch is inspected.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
// Host copies of the results.
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
// disX and disY are not released here.
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
...
========= Invalid __global__ read of size 4
========= at 0x00000418 in /home/ubuntu/bobc/misc/t146.cu:41:extractor(unsigned char const *, unsigned char*, int*, int*, int*, int, int, int, int)
========= by thread (197,0,0) in block (17,0,0)
========= Address 0x00c8b290 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5)
...
(and much more output like this)
int* disX = new int[num_sample_per_point];
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
^^^^
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
// Sampling kernel, one thread per pixel (flat 1D index).
// Without -DFAIL only debug[idx] is written; with -DFAIL the sampling code
// that reads disX/disY/in and writes out is compiled in as well.
// disX and disY are dereferenced in device code, so they must be addresses
// the device can read.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
// Thread check: threads whose index is past the last pixel do nothing.
if (idx < check) {
debug[idx] = threadIdx.x;
// Row (y) and column (x) of this thread's pixel.
y = idx/width;
x = idx%width;
#ifdef FAIL
// Pixels within `pad` of any image edge take the range-checked path.
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
// Copy only when the sampled position lies inside the image; otherwise
// the output slot is left unwritten.
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
// Interior pixels: samples are copied without a range check.
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
// Builds the list of sampling radii: starts at 0 and keeps appending while
// r <= L/2, stepping by INC1 while r < R_IN and by INC2 afterwards.
// Per-value printing is disabled in this version: only the "circlePos:"
// header and an empty line are written.
vector<int> getCirclePos() {
    vector<int> radii;
    for (int r = 0; r <= (L/2); r += (r < R_IN) ? INC1 : INC2) {
        radii.push_back(r);
    }
    cout << "circlePos:" << endl;
    cout << endl;
    return radii;
}
// Host driver (corrected version): builds the sampling offsets on the host,
// copies them to device memory, uploads an all-zero 2048x2048 image, launches
// extractor with one thread per pixel and copies the out/debug buffers back.
// Every CUDA runtime call is checked; the program prints a message and exits
// with status 1 on the first failure. Returns 0 on success.
int main(int arg, char* args[])
{
    // Report a failed CUDA call and stop. Process exit releases device
    // memory, so no manual cleanup is attempted on the error path.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cout << what << " failed: " << cudaGetErrorString(status) << std::endl;
            exit(1);
        }
    };

    const vector<int> circlePos = getCirclePos();

    // Sampling offsets: N_P directions per radius, stored in host vectors so
    // they are released automatically.
    const int num_sample_per_point = static_cast<int>(circlePos.size()) * N_P;
    vector<int> disX(num_sample_per_point);
    vector<int> disY(num_sample_per_point);
    int cnt = 0;
    for (size_t i = 0; i < circlePos.size(); ++i) {
        const int r = circlePos[i];
        for (int j = 0; j < N_P; ++j) {
            const float angle = j*360.0/N_P;  // degrees
            // Uses the file's PI macro instead of the non-standard M_PI.
            // The double results are converted to int on assignment.
            disX[cnt] = r*cos(angle*PI/180.0);
            disY[cnt] = r*sin(angle*PI/180.0);
            cnt++;
        }
    }

    // The kernel dereferences the offset arrays, so they need device copies.
    const size_t offsetBytes = num_sample_per_point * sizeof(int);
    int *d_disX = 0, *d_disY = 0;
    check(cudaMalloc(&d_disX, offsetBytes), "cudaMalloc(d_disX)");
    check(cudaMalloc(&d_disY, offsetBytes), "cudaMalloc(d_disY)");
    check(cudaMemcpy(d_disX, disX.data(), offsetBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_disX)");
    check(cudaMemcpy(d_disY, disY.data(), offsetBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_disY)");

    const unsigned int w = 2048;
    const unsigned int h = 2048;
    cout << "w: " << w << " h: " << h << endl;

    const int size = w * h;  // number of pixels, one byte each
    const size_t outBytes = static_cast<size_t>(num_sample_per_point) * size * sizeof(unsigned char);
    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;
    // Allocate GPU buffers for the images
    check(cudaMalloc((void**)&in, size * sizeof(unsigned char)), "cudaMalloc(in)");
    check(cudaMalloc((void**)&out, outBytes), "cudaMalloc(out)");
    check(cudaMalloc((void**)&debug, size * sizeof(int)), "cudaMalloc(debug)");

    // Synthetic all-zero input image.
    const vector<unsigned char> img_data(size, 0);
    // The per-element dumps are disabled for this image size; the section
    // headers are still printed so the output layout is unchanged.
    cout << "========= img_data ==========" << endl;
    cout << endl;

    // Copy image data from host memory to GPU buffers.
    check(cudaMemcpy(in, img_data.data(), size * sizeof(unsigned char), cudaMemcpyHostToDevice), "cudaMemcpy(in)");

    // Launch a kernel on the GPU with one thread for each element.
    const dim3 b_dim(BLOCK_SIZE, 1, 1);                            // (1024, 1, 1)
    // Ceiling division: just enough blocks to cover every pixel.
    const dim3 g_dim((size + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);  // (4096, 1, 1)
    const int pad = L/2;
    extractor<<<g_dim, b_dim>>>(in, out, debug, d_disX, d_disY, w, h, pad, num_sample_per_point);
    check(cudaGetLastError(), "Kernel launch");          // bad launch configuration
    check(cudaDeviceSynchronize(), "Kernel execution");  // faults raised while running

    // Copy the results back into host buffers that free themselves.
    vector<unsigned char> tmp(static_cast<size_t>(size) * num_sample_per_point);
    vector<int> tmp_debug(size);
    check(cudaMemcpy(tmp_debug.data(), debug, size * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(debug)");
    check(cudaMemcpy(tmp.data(), out, outBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(out)");

    cout << "========= out =========" << endl;
    cout << endl;
    cout << "========debug=======" << endl;
    cout << endl;

    // Release every device allocation, including the offset arrays.
    cudaFree(in);
    cudaFree(out);
    cudaFree(debug);
    cudaFree(d_disX);
    cudaFree(d_disY);
    return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$