Cuda 内核中的新运算符。。奇怪的行为_Cuda

CUDA 内核中的 new 运算符……奇怪的行为

cuda

Cuda 内核中的新运算符。。奇怪的行为,cuda,Cuda,我想知道是否有人可以通过内核中的新操作符来解释这种行为。。下面是代码 #include <stdio.h> #include "cuda_runtime.h" #include "cuComplex.h" using namespace std; __global__ void test() { cuComplex *store; store= new cuComplex[30000]; if (store==NULL) printf("Unable to

我想知道是否有人能解释在内核中使用 new 运算符时出现的这种行为。下面是代码:

#include <stdio.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
using namespace std;
/**
 * Allocation probe: every thread requests a 30000-element cuComplex array
 * with device-side `new`, reports a failed request, and releases the array.
 *
 * Launch layout: none required. The only index consumed is blockIdx.y, which
 * is printed to identify the block whose allocation failed.
 * Shared memory: none.
 *
 * Device-side `new` is served from the device malloc heap, whose capacity is
 * set on the host through cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...).
 * When the concurrently running threads ask for more than that heap holds,
 * the allocation yields NULL and the message below is printed.
 */
__global__ void test()
{
    cuComplex *store = new cuComplex[30000];
    if (store == NULL) {
        printf("Unable to allocate %i\n", blockIdx.y);
        return;  // nothing was allocated, so nothing to touch or release
    }

    // Dummy write whose only purpose is to keep the compiler from discarding
    // `store`; the condition is not expected to hold for any launched thread.
    // It must run while the array is still alive, i.e. before the delete.
    if (threadIdx.x == 10000) store->x = 0.0f;

    // Array form of delete to match the array form of new above.
    delete[] store;
}

/* Evaluate a CUDA runtime call; on failure print the error and make main()
 * return 1. Only usable inside main() because of the embedded return. */
#define CUDA_TRY(call)                                                     \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(status_));                          \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/**
 * Launches the `test` kernel on a 1 x 500 x 1 grid of single-thread blocks
 * and prints the elapsed time measured with a pair of CUDA events.
 *
 * Returns 0 on success, 1 if any CUDA runtime call reports an error.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    float timestamp = 0.0f;
    cudaEvent_t event_start, event_stop;

    // Initialise
    CUDA_TRY(cudaEventCreate(&event_start));
    CUDA_TRY(cudaEventCreate(&event_stop));

    const dim3 threadsPerBlock(1, 1, 1);
    const dim3 blocks(1, 500, 1);

    // A single start record immediately before the launch; an earlier record
    // of the same event would simply be overwritten by this one.
    CUDA_TRY(cudaEventRecord(event_start, 0));
    test<<<blocks, threadsPerBlock, 0>>>();
    CUDA_TRY(cudaGetLastError());  // launch-configuration errors surface here
    CUDA_TRY(cudaEventRecord(event_stop, 0));

    // Waiting on the stop event means the kernel has finished; errors raised
    // while the kernel was executing are reported by this call.
    CUDA_TRY(cudaEventSynchronize(event_stop));
    CUDA_TRY(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test took  %fms \n", timestamp);

    CUDA_TRY(cudaEventDestroy(event_start));
    CUDA_TRY(cudaEventDestroy(event_stop));
    return 0;
}
#undef CUDA_TRY

#包括
#包括“cuda_runtime.h”
#包括“cuComplex.h”
使用名称空间std；
__全局无效测试（）
{
cuComplex*商店；
存储=新的cuComplex[30000]；
if（store==NULL）printf（“无法分配%i\n”，blockIdx.y）；
删除存储；
如果（threadIdx.x==10000）存储->x=0.0；
}
int main（int argc，char*argv[]）
{
浮动时间戳；
cudaEvent事件启动、事件停止；
//初始化
cudaEventCreate（&event_start）；
cudaEventCreate（&event_-stop）；
cudaEventRecord（事件开始，0）；
dim3螺纹锁紧；
dim3块；
threadsPerBlock.x=1；
threadsPerBlock.y=1；
threadsPerBlock.z=1；
块x=1；
块。y=500；
块。z=1；
cudaEventRecord（事件启动）；
test（）；
cudaEventRecord（事件停止，0）；
cudaEventSynchronize（事件停止）；
CudaEventReleasedTime（&时间戳、事件开始、事件停止）；
printf（“测试采用%fms\n”，时间戳）；
}

在 GTX680、CUDA 5 上运行这段代码并检查输出时,会发现内存分配会随机失败 :( 我想可能是因为全局内存已经用完了,但是我有 2GB 显存,而且活动块的最大数量是 16,所以用这种方式分配的内存量最多应该是 16×30000×8 = 3.84×10^6 字节,也就是大约 3.8 MB。那么我还应该考虑什么?

这个问题与设备端 malloc() 和 free() 系统调用所使用的堆的大小有关。详见 NVIDIA CUDA C 编程指南中的"第 3.2.9 节 调用栈"和"附录 B.16.1 堆内存分配"。如果您将堆大小设置为满足内核的需求,那么测试就能正常工作:

    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 500*30000*sizeof(cuComplex));

分配的内存应为

16*30000*sizeof（cuComplex）

还要注意，

threadIdx.x

永远不会等于 10000(每个块只启动了 1 个线程)。

这可能只是运行时堆上的内存碎片。您是否尝试过将运行时堆大小增加一点?

是的,但是 cuComplex 只有 8 个字节,所以总和只会增加到 32 MB,仍然有更多可用内存(2GB)。关于 threadIdx.x==10000,这只是为了骗过编译器,让它不要把 store 变量优化掉。

@tolanmies 我插入了代码 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 30000*8*400);它起作用了……但是我如何计算需要增加到多大呢?这会消耗我所有的内存……而且实际上并不需要,因为只有来自 6 个块的分配是可能的……有没有办法对这个限制进行更精细的计算呢?

将上面调用中的 500 替换为

并发块数 × 流式多处理器(SM)数

以获得所需的最小堆大小,在 GTX680 中为

SM=8

，并发块的数量取决于内核对寄存器或共享内存的要求。