使用CUDA内核获取堆栈溢出_C_Pointers_Cuda_Stack Overflow_Parallel Processing

使用CUDA内核获取堆栈溢出

c pointers cuda parallel-processing

使用CUDA内核获取堆栈溢出,c,pointers,cuda,stack-overflow,parallel-processing,C,Pointers,Cuda,Stack Overflow,Parallel Processing,我正在编程的代码有一个很大的问题。我不是专家，来这里之前我问过很多人。也纠正了很多事情。所以，我想我已经准备好向您展示代码并向您提问了。我将把整个代码放在这里，让您更好地理解我的问题所在。我想做的是，如果ARRAY\u SIZE对于线程大小来说太大，那么我将大数组的数据放入一个较小的数组中，这个数组是专门用SIZETHREAD\u SIZE创建的。然后，我将它发送到内核并执行我必须执行的任何操作。但我在这方面有问题 isub_matrix[x*THREAD_SIZE+y]=big_mat

我正在编程的代码有一个很大的问题。我不是专家，来这里之前我问过很多人。也纠正了很多事情。所以，我想我已经准备好向您展示代码并向您提问了。我将把整个代码放在这里，让您更好地理解我的问题所在。我想做的是，如果

ARRAY_SIZE

对于线程大小来说太大，那么我将大数组的数据放入一个较小的数组中，这个数组是专门用SIZE

THREAD_SIZE

创建的。然后，我将它发送到内核并执行我必须执行的任何操作。但我在这方面有问题

isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];

由于堆栈溢出，代码停止的位置。首先，我制作了一个大矩阵的双指针。但是freenode irc网络的#cuda频道的人告诉我，它太大了，CPU内存无法处理它，我应该创建一个线性指针。我做到了，但是我仍然有相同的堆栈溢出问题。所以，它来了。。。在一些更改后更新，但仍然不起作用（堆栈溢出已停止，但链接和清单更新失败）

#定义数组大小2048
#定义线程大小为32
#定义PI 3.14
int main（int argc，字符**argv）
{
int数组_plus=0，x，y；
浮动时间；
//unsigned int memsize=sizeof（float）*线程大小*线程大小；
//布尔数组_rest；
cudaEvent\u t启动、停止；
float*d_-isub_矩阵；
浮点*大矩阵=新浮点[数组大小*数组大小]；
float*big\u matrix2=新的float[数组大小*数组大小]；
浮点*isub_矩阵=新浮点[螺纹尺寸*螺纹尺寸]；
浮动*osub_矩阵=新浮动[螺纹尺寸*螺纹尺寸]；
//如果数组的大小与线程的大小不兼容，它将无法工作。
//数组大小=数组大小*数组大小/（线程大小*线程大小）；
//isub_矩阵=（浮点*）malloc（memsize）；
//osub_矩阵=（浮点*）malloc（memsize）；
if（（（数组大小*数组大小）%（线程大小*线程大小）==0））
{
//在CPU内存和GPU内存中为大矩阵及其子矩阵分配空间
//它必须是这样的（很多循环）
//填充大数组
对于（x=0；x=螺纹尺寸））
{
//在CPU内存和GPU内存中为大矩阵及其子矩阵分配空间
//它必须是这样的（很多循环）
//填充大数组
对于（x=0；x我认为问题出在这条线上
 cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

这是因为您在设备中同时分配了 osub_matrix 和 isub_matrix。
我看到这段代码中存在大量问题
在将数据从 big_matrix 复制到 isub_matrix 之前，您没有为 isub_matrix 分配内存：
    for(x=0;x<THREAD_SIZE;x++)
    {
        for(y=0;y<THREAD_SIZE;y++)
            isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
    }

顺便说一句，内核调用不应该写成
twiddle_factor(isub_matrix, osub_matrix)
而应该是
twiddle_factor(d_isub_matrix, osub_matrix);
最终和完成的代码：
int main(int argc, char** argv)
{
        int array_plus=0,x,y;
        int array_plus_x, array_plus_y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array's size is not compatible with the thread's size, it won't work.

        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {

            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)



            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start,0);
            for(array_plus_x = 0; array_plus_x < ARRAY_SIZE; array_plus_x += THREAD_SIZE)
            for(array_plus_y = 0; array_plus_y < ARRAY_SIZE; array_plus_y += THREAD_SIZE)
            {


                //putting the big array's values into the sub-matrix

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel

                dim3 block(32,32);
                twiddle_factor<<<1,block>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)]=osub_matrix[x*THREAD_SIZE+y];
                }

                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
            }

            //Stop the time

            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU

int main（int argc，char**argv）
{
int数组_plus=0，x，y；
int数组加x，数组加y；
浮动时间；
//unsigned int memsize=sizeof（float）*线程大小*线程大小；
//布尔数组_rest；
cudaEvent\u t启动、停止；
浮点*d_-isub_矩阵，*d_-osub_矩阵；
浮点*大矩阵=新浮点[数组大小*数组大小]；
float*big\u matrix2=新的float[数组大小*数组大小]；
浮点*isub_矩阵=新浮点[螺纹尺寸*螺纹尺寸]；
浮动*osub_矩阵=新浮动[螺纹尺寸*螺纹尺寸]；
//如果数组的大小与线程的大小不兼容，它将无法工作。
//数组大小=数组大小*数组大小/（线程大小*线程大小）；
//isub_矩阵=（浮点*）malloc（memsize）；
//osub_矩阵=（浮点*）malloc（memsize）；
if（（（数组大小*数组大小）%（线程大小*线程大小）==0）和&（数组大小>=线程大小））
{
//在CPU内存和GPU内存中为大矩阵及其子矩阵分配空间
//它必须是这样的（很多循环）
//填充大数组
对于（x＝0；席胡）在将数据从BigyMatLIX复制到ISUBUMIX之前，没有看到你正在分配内存。你没有得到分割错误吗？而且，我看不到主机到设备的任何CUDAMEMCPY。你在分配ISUBUME矩阵的内存，但是从来没有拷贝任何数据到它。@斯卡尔佩兹：虽然我欣赏幽默，但它不是G。根据一个机智的标题对问题进行投票是个好主意。1.我这样做了，然后使用float-isub_矩阵=新float[THREAD-SIZETHREAD_SIZE]；（与osub_矩阵相同）2.我还创建了另一个变量来执行你的建议：cudaMemcpy（d_-isub_矩阵，isub_矩阵，（（THREAD-SIZE*THREAD_SIZE）*sizeof（float）），cudamemcpyhostodice）；3.好的，我这样做了：）谢谢你的更正1。（而且…）它不是真的有用。我删除了这部分：Pnow没有堆栈溢出错误…但是它必须链接并“创建清单”，它失败了。那么还有其他问题吗？：（我已经更新了更正。你能告诉我需要解决的错误是什么吗？
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#define ARRAY_SIZE 2048
#define THREAD_SIZE 32
#define PI 3.14



/*
 * twiddle_factor
 *
 * For one THREAD_SIZE x THREAD_SIZE tile, each thread (threadIdx.x, threadIdx.y)
 * writes one value to osub_matrix at the transposed position
 * [threadIdx.y * THREAD_SIZE + threadIdx.x].
 *
 * Parameters:
 *   isub_matrix - input tile, read as THREAD_SIZE*THREAD_SIZE floats
 *                 (indexed [row * THREAD_SIZE + col]).
 *   osub_matrix - output tile, written as THREAD_SIZE*THREAD_SIZE floats.
 *
 * Launch layout: only threadIdx is used (no blockIdx), so every block of a
 * launch performs the same writes. Threads whose threadIdx.x or threadIdx.y
 * is >= THREAD_SIZE write nothing; a THREAD_SIZE x THREAD_SIZE block is
 * needed for the whole output tile to be filled.
 *
 * Changes versus the previous version (the numeric result is unchanged):
 *   - Every thread used to fill two private THREAD_SIZE x THREAD_SIZE float
 *     arrays (8 KB per thread with THREAD_SIZE == 32) and then read a single
 *     element of each. The running sums are now carried in two scalars and
 *     the loops stop at the element this thread needs.
 *   - The __shared__ staging tile and both __syncthreads() calls are gone:
 *     each thread only ever read back the element it had written itself, so
 *     no data was exchanged between threads.
 *
 * NOTE(review): the sine/cosine accumulators are never reset between
 * elements, so the value for element (x, y) includes the contributions of
 * all elements that precede it in (x, y) loop order. That is what the
 * original code computed and it is preserved here; confirm whether a
 * per-element reset was intended.
 */
__global__ void twiddle_factor(float *isub_matrix, float *osub_matrix)
{
    const unsigned int xIndex = threadIdx.x;
    const unsigned int yIndex = threadIdx.y;

    // Threads outside the tile produce no output. Returning early is safe
    // because the kernel no longer contains any block-wide barrier.
    if ((xIndex >= THREAD_SIZE) || (yIndex >= THREAD_SIZE))
        return;

    // Evaluated exactly like the original expression, then narrowed to float.
    const float angle = (float)((2*PI)/THREAD_SIZE);

    float sum_sines = 0.0f;
    float sum_cosines = 0.0f;

    // Replay the original accumulation order (x outer, y middle, z inner) and
    // stop right after element (xIndex, yIndex), so the running sums match the
    // values the original stored in sum_sin/sum_cos[xIndex][yIndex].
    for (unsigned int x = 0; x <= xIndex; x++)
    {
        // Full sweep over y for earlier x; partial sweep on the final x.
        const unsigned int lastY = (x == xIndex) ? yIndex : (THREAD_SIZE - 1);

        for (unsigned int y = 0; y <= lastY; y++)
        {
            for (int z = 0; z < THREAD_SIZE; z++)
            {
                const float phase = isub_matrix[y*THREAD_SIZE+z]*(angle*z);
                sum_sines = sum_sines + sinf(phase);
                sum_cosines = sum_cosines + cosf(phase);
            }
        }
    }

    // Transposed store: the value for (xIndex, yIndex) goes to row yIndex,
    // column xIndex of the output tile.
    osub_matrix[yIndex*THREAD_SIZE+xIndex] =
        expf(sum_sines/THREAD_SIZE) + expf(sum_cosines/THREAD_SIZE);
}


int main(int argc, char** argv) 
{
        int array_plus=0,x,y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array's size is not compatible with the thread's size, it won't work.

        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {

            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)



            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start,0);

            while(array_plus<ARRAY_SIZE)
            {

                //putting the big array's values into the sub-matrix

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel

                twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                array_plus=array_plus+THREAD_SIZE;
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[x*THREAD_SIZE+array_plus+y]=osub_matrix[x*THREAD_SIZE+y];
                }


                cudaFree(isub_matrix);
                cudaFree(osub_matrix);
                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
            }

            //Stop the time

            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU

 cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

    for(x=0;x<THREAD_SIZE;x++)
    {
        for(y=0;y<THREAD_SIZE;y++)
            isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
    }

        //putting the big array's values into the sub-matrix

        for(x=0;x<THREAD_SIZE;x++)
        {
            for(y=0;y<THREAD_SIZE;y++)
                isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
        }

for(x=0;x<THREAD_SIZE;x++)
            {
                for(y=0;y<THREAD_SIZE;y++)
                    isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus)*ARRAY_SIZE+y];
            }

twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);

cudaMemcpy(osub_matrix,d_osub_matrix, ((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

int main(int argc, char** argv)
{
        int array_plus=0,x,y;
        int array_plus_x, array_plus_y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array's size is not compatible with the thread's size, it won't work.

        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {

            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)



            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start,0);
            for(array_plus_x = 0; array_plus_x < ARRAY_SIZE; array_plus_x += THREAD_SIZE)
            for(array_plus_y = 0; array_plus_y < ARRAY_SIZE; array_plus_y += THREAD_SIZE)
            {


                //putting the big array's values into the sub-matrix

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel

                dim3 block(32,32);
                twiddle_factor<<<1,block>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)]=osub_matrix[x*THREAD_SIZE+y];
                }

                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
            }

            //Stop the time

            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU