CUDA-“；不支持未对齐的内存访问”；_C_Visual Studio 2012_Cuda

CUDA-“；不支持未对齐的内存访问”；

c visual-studio-2012 cuda

CUDA-“；不支持未对齐的内存访问”；,c,visual-studio-2012,cuda,C,Visual Studio 2012,Cuda,关于此计划的前几项问题：我正在使用Visual Studio 2012和CUDA 6 代码应该使用CUDA将模糊效果添加到BMP文件中。在转为CUDA之前，一切都很顺利。这是我与C和CUDA的第一个项目，所以我可能犯了一些愚蠢的错误。我的代码中有76个错误，其中大多数是“此声明没有存储类或类型说明符”，还有更多没有任何意义的错误。我在Hello World程序之前尝试过，效果不错。也有同样的错误，所以我并不真正关心它们但我有两个不同的错误： Error 2 error : U

关于此计划的前几项问题：

我正在使用Visual Studio 2012和CUDA 6

代码应该使用CUDA将模糊效果添加到BMP文件中。在转为CUDA之前，一切都很顺利。这是我与C和CUDA的第一个项目，所以我可能犯了一些愚蠢的错误。我的代码中有76个错误，其中大多数是“此声明没有存储类或类型说明符”，还有更多没有任何意义的错误。我在Hello World程序之前尝试过，效果不错。也有同样的错误，所以我并不真正关心它们

但我有两个不同的错误：

Error    2    error : Unaligned memory accesses not supported  C:\Users\Karpińscy\documents\visual studio 2012\Projects\blur\blur\kernel.cu    blur

以及：

错误3错误MSB3721:命令“”C:\Program Files\NVIDIA GPU 计算工具包\CUDA\v6.0\bin\nvcc.exe“ -gencode=arch=compute\U 10，code=\“sm\U 10，compute\U 10\”--使用本地环境--cl版本2012-ccbin“C:\Program Files（x86）\Microsoft Visual Studio 11.0\VC\bin”-I“C:\Program Files\NVIDIA GPU Computing” Toolkit\CUDA\v6.0\include“-I”C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include“-G-maxregcount=0--machine 32 --compile-cudart static-g-DWIN32-D_DEBUG-D_CONSOLE-D_MBCS-Xcompiler”/EHsc/W3/nologo/Od/Zi/RTC1/MDd“-o DEBUG\kernel.cu.obj”C:\Users\karpinscy\documents\visualstudio 2012\Projects\blur\blur\kernel.cu“”已退出，代码为2。C:\程序文件（x86）\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 6.0.5目标597 9模糊

我甚至在google.com的第二个网站上搜索过答案，但我还并没有找到适合我的解决方案。请帮帮我

程序代码：

#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>


#pragma pack(push,1)
/* Windows 3.x bitmap file header */
typedef struct {
    char         filetype[2];   /* magic - always 'B' 'M' */
    unsigned int filesize;
    short        reserved1;
    short        reserved2;
    unsigned int dataoffset;    /* offset in bytes to actual bitmap data */
} file_header;

/* Windows 3.x bitmap full header, including file header */
typedef struct {
    file_header  fileheader;
    unsigned int headersize;
    int          width;
    int          height;
    short        planes;
    short        bitsperpixel;  /* we only support the value 24 here */
    unsigned int compression;   /* we do not support compression */
    unsigned int bitmapsize;
    int          horizontalres;
    int          verticalres;
    unsigned int numcolors;
    unsigned int importantcolors;
} bitmap_header;
#pragma pack(pop)

__global__ void blur(bitmap_header* hp, unsigned char *data)
{
    int xx,yy,x,y, avgB, avgG, avgR, ile;
    int blurSize = 5;

    xx = blockIdx.y * blockDim.y + threadIdx.y;
    yy = blockIdx.x * blockDim.x + threadIdx.x;

    if(xx >= hp->width || yy >= hp->height)
        return;


    avgB = avgG = avgR = 0;
    ile = 0;

    for(x = xx; x < hp->width && x < xx + blurSize; x++)
    {


        for(y = yy; y < hp->height && y < yy + blurSize; y++)
        {
            avgB += data[x*3 + y*hp->width*3 + 0];
            avgG += data[x*3 + y*hp->width*3 + 1];
            avgR += data[x*3 + y*hp->width*3 + 2];
            ile++;
        }
    }

    avgB = avgB / ile;
    avgG = avgG / ile;
    avgR = avgR / ile;

    data[xx*3 + yy*hp->width*3 + 0] = avgB;
    data[xx*3 + yy*hp->width*3 + 1] = avgG;
    data[xx*3 + yy*hp->width*3 + 2] = avgR;
}

int filter(char* input, char *output)
{
    FILE *fp,*out;
    bitmap_header* hp;
    bitmap_header* d_hp;
    unsigned char *data;
    unsigned char *d_data;

    //Open input file:
    fp = fopen(input, "r");
    if(fp==NULL)
        return 1;

    //Read the input file headers:
    hp=(bitmap_header*)malloc(sizeof(bitmap_header));

    cudaMalloc( &d_hp, sizeof(bitmap_header));

    if(hp==NULL)
        return 1;

    fread(hp, sizeof(bitmap_header), 1, fp);

    cudaMemcpy(d_hp, hp, sizeof(bitmap_header), cudaMemcpyHostToDevice);

    //Read the data of the image:
    data = (unsigned char*)malloc(sizeof(char)*hp->bitmapsize);

    cudaMalloc( &d_data, sizeof(char)*hp->bitmapsize);

    fseek(fp,sizeof(char)*hp->fileheader.dataoffset,SEEK_SET);
    fread(data,sizeof(char),hp->bitmapsize, fp);

    cudaMemcpy(d_data, data, sizeof(char)*hp->bitmapsize, cudaMemcpyHostToDevice);

    //Not sure if correctly calling function
    dim3 block(16,16);
    dim3 grid ( (hp->height + 15)/16, (hp->width + 15)/16 );
    blur<<<grid,block>>>(d_hp, d_data);

    cudaMemcpy(data, d_data, sizeof(char)*hp->bitmapsize, cudaMemcpyDeviceToHost);

    //Open output file:
    out = fopen(output, "wb");
    if(out==NULL)
    {
        fclose(fp);
        free(hp);
        free(data);
        cudaFree(d_data);
        cudaFree(d_hp);
        return 1;
    }

    fwrite(hp,sizeof(char),sizeof(bitmap_header),out);

    fseek(out,sizeof(char)*hp->fileheader.dataoffset,SEEK_SET);
    fwrite(data,sizeof(char),hp->bitmapsize,out);

    fclose(fp);
    fclose(out);
    free(hp);
    free(data);

    cudaFree(d_data);
    cudaFree(d_hp);
    return 0;
}

int main(int argc, char* argv[])
{
    char *path = "file.bmp";
    filter(path,path);

    return 0;
}

#包括
#包括
#包括
#pragma包（推送，1）
/*Windows 3.x位图文件头*/
类型定义结构{
char文件类型[2]；/*magic-始终为“B”“M”*/
无符号整数文件大小；
短期储备1；
储备不足2；
unsigned int dataoffset；/*实际位图数据的字节偏移量*/
}文件头；
/*Windows 3.x位图完整头，包括文件头*/
类型定义结构{
文件头文件头；
无符号整数标题；
整数宽度；
内部高度；
短平面；
short bitsperpixel；/*我们这里只支持值24*/
无符号整数压缩；/*我们不支持压缩*/
无符号整数位图大小；
内水平线；
垂直方向；
无符号整数颜色；
未签名的国际重要颜色；
}位图头；
#布拉格语包（流行语）
__全局无效模糊（位图标题*hp，无符号字符*数据）
{
int xx、yy、x、y、avgB、avgG、avgR、ile；
int-fullsize=5；
xx=块IDX.y*块尺寸y+线程IDX.y；
yy=blockIdx.x*blockDim.x+threadIdx.x；
如果（xx>=hp->宽度| yy>=hp->高度）
返回；
avgB=avgG=avgR=0；
ile=0；
对于（x=xx；xwidth&&xheight&&y宽度*3+0]；
avgG+=数据[x*3+y*hp->宽度*3+1]；
avgR+=数据[x*3+y*hp->宽度*3+2]；
ile++；
}
}
avgB=avgB/ile；
avgG=avgG/ile；
avgR=avgR/ile；
数据[xx*3+yy*hp->宽度*3+0]=avgB；
数据[xx*3+yy*hp->宽度*3+1]=avgG；
数据[xx*3+yy*hp->宽度*3+2]=avgR；
}
int过滤器（字符*输入，字符*输出）
{
文件*fp，*out；
位图_头*hp；
位图_头*d_马力；
无符号字符*数据；
无符号字符*d_数据；
//打开输入文件：
fp=fopen（输入，“r”）；
如果（fp==NULL）
返回1；
//读取输入文件标题：
hp=（位图_头*）malloc（sizeof（位图_头））；
cudaMalloc（&d_hp，sizeof（位图_头））；
如果（hp==NULL）
返回1；
fread（hp，sizeof（位图头），1，fp）；
cudaMemcpy（d_hp，hp，sizeof（位图_头），cudaMemcpyHostToDevice）；
//读取图像的数据：
数据=（无符号字符*）malloc（sizeof（字符）*hp->bitmapsize）；
cudaMalloc（&d_数据，大小（字符）*hp->bitmapsize）；
fseek（fp，sizeof（char）*hp->fileheader.dataoffset，SEEK\u SET）；
fread（数据，sizeof（char），hp->bitmapsize，fp）；
cudaMemcpy（d_数据，数据，大小（字符）*hp->bitmapsize，cudaMemcpyHostToDevice）；
//不确定是否正确调用了函数
dim3区块（16,16）；
dim3网格（（hp->高度+15）/16，（hp->宽度+15）/16）；
模糊（d_hp，d_数据）；
cudaMemcpy（数据、数据、大小（字符）*hp->bitmapsize、cudaMemcpyDeviceToHost）；
//打开输出文件：
out=fopen（输出，“wb”）；
if（out==NULL）
{
fclose（fp）；
免费（hp）；
免费（数据）；
cudaFree（d_数据）；
cudaFree（d_hp）；
返回1；
}
fwrite（hp、sizeof（char）、sizeof（位图_头）、out）；
fseek（out，sizeof（char）*hp->fileheader.dataoffset，SEEK\u SET）；
fwrite（数据，大小（字符），hp->bitmapsize，out）；
fclose（fp）；
fclose（out）；
免费（hp）；
免费（数据）；
cudaFree（d_数据）；
cudaFree（d_hp）；
返回0；
}
int main（int argc，char*argv[]）
{
char*path=“file.bmp”；
过滤器（路径，路径）；
返回0；
}

我被要求从实现错误检查，但我不知道它如何或是否真的能帮助我

编辑：

多亏了@DanielKamilKozar，我解决了这些问题。程序可以编译，但模糊不会添加到BMP文件中。CUDA语法是否正确调用了blur函数？

我解决了这个问题，没有按函数参数发送完整的BMP头，而是发送必要的内容。我还有一个函数没有被调用的问题，我通过更新CUDA软件解决了这个问题。

我能够通过将arch值从sm_10更改为sm_20来解决这个问题。我的应用程序运行在Win 8.1 x64 VS2012上的GT750M上。

我对CUDA几乎没有任何经验或知识，但行

数据[x*3+y*hp->width*3+2]