从CUDA中的线程ID获取int
我是CUDA的新手。我需要在计算中使用线程id,但它不起作用。rem始终为0。我需要线程的索引来计算数组中的索引,所以我不能将它们转换为浮点来进行计算 内核如下所示:从CUDA中的线程ID获取int,cuda,parallel-processing,gpu,Cuda,Parallel Processing,Gpu,我是CUDA的新手。我需要在计算中使用线程id,但它不起作用。rem始终为0。我需要线程的索引来计算数组中的索引,所以我不能将它们转换为浮点来进行计算 内核如下所示: _global__ void initializationCubes(float* dVer, float* dCub, int gridSize, float* test) { int index=blockIdx.x*blockDim.x+threadIdx.x; if(index<(gridSi
_global__ void initializationCubes(float* dVer, float* dCub, int gridSize, float* test)
{
int index=blockIdx.x*blockDim.x+threadIdx.x;
if(index<(gridSize*gridSize*gridSize))
{
// conversion index -> i,j,k
int rem=index;
int qot=(rem/gridSize);
int i=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int j=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int k=rem-(qot*gridSize);
for(int x=0;x<7;x++){
// these first three are used to test
dCub[index*56+0+x] =index;
dCub[index*56+7+x] =rem;
dCub[index*56+14+x]=k;
dCub[index*56+21+x]=dVer[((i*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
dCub[index*56+28+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k)*7+x];
dCub[index*56+35+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k+1)*7+x];
dCub[index*56+42+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k+1)*7+x];
dCub[index*56+49+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
}
}
}
__global__ void initializationVertices(float* dVer, int gridSize){
int currentVertex=0;
for(int i=0; i<gridSize+1; i++)
{
for(int j=0; j<gridSize+1; j++)
{
for(int k=0; k<gridSize+1; k++)
{
dVer[currentVertex+0]=((i*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+1]=((j*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+2]=((k*2.0f)/(gridSize)-1.0f)*2.0f;
currentVertex+=7;
}
}
}
extern "C"
void initializationCUDA1( const int verticesAtEndsOfEdges[24], const int eTable[256], int gSize, int numberParticles ) {
numParticles=numberParticles;
gridSize=gSize;
numVertices=(gridSize+1)*(gridSize+1)*(gridSize+1);
numCubes=(gridSize)*(gridSize)*(gridSize);
size_t pitchv=7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc=7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
cudaMalloc((void **)&verticesAtEnds, 24*sizeof(int));
cudaMalloc((void **)&dedgeTable, 256*sizeof(int));
cudaMalloc((void **)&dtriTable, 256*16*sizeof(int));
cudaMalloc((void **)&ballPoint, 3*sizeof(float));
cudaMalloc((void **)&dpositions, 3*numberParticles*sizeof(float));
cudaMalloc((void **)&dedgeVertices, numCubes*6*12*sizeof(float));
cudaMalloc((void **)&result, numCubes*18*sizeof(float));
output=(float*)malloc(numCubes*18*sizeof(float));
cudaMalloc((void **)&numFaces, 10*sizeof(int));
cudaMalloc((void **)&test, sizeof(float));
initializationVertices<<<1,1>>>(dVer, gridSize);
initializationCubes<<<128,256>>>( dVer, dCub, gridSize, test);
float* tmp =(float*)malloc(numCubes*56*(sizeof(float)));
cudaMemcpy(tmp, dCub, numCubes*56*sizeof(float), cudaMemcpyDeviceToHost);
for(int a=0;a<100;a++){
printf("%f\n",tmp[a]);
}
}
\u全局\u\u无效初始化多维数据集(float*dVer、float*dCub、int-gridSize、float*test)
{
int index=blockIdx.x*blockDim.x+threadIdx.x;
if(指数i,j,k)
int-rem=指数;
int qot=(rem/gridSize);
int i=rem-(qot*网格大小);
rem=(rem)/(网格大小);
qot=(雷姆/网格大小);
int j=rem-(qot*网格大小);
rem=(rem)/(网格大小);
qot=(雷姆/网格大小);
int k=rem-(qot*网格大小);
对于(int x=0;x如果gridSize
是网格的大小,顾名思义,rem
和qot
在执行代码后将始终为零,因为它们被大于自身的值除以
如果要在三维网格中查找索引,这正是threadIdx
和blockIdx
有三个组件的原因。根本不需要昂贵的划分,只需使用以下标准代码段:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
if (i < myBlockSize.x && j < myBlockSize.y && k<myBlockSize.z) {
// your kernel code...
}
inti=blockIdx.x*blockDim.x+threadIdx.x;
int j=blockIdx.y*blockDim.y+threadIdx.y;
int k=blockIdx.z*blockDim.z+threadIdx.z;
如果(我)MyBaskSig.x&&MyBuffigy.y& & Ky,但是这不是我需要的。我用线性的方式来排列数组,我需要用这个数组来计算另一个数组中的索引。我在C++中测试了索引计算,它工作得很好。它只需要一个有效的int来开始抱歉,但是我不理解你的问题。x
你已经有了?请展示完整的代码。顺便说一句,你仍然可以不用昂贵的除法——只需从三个分量中计算线性索引,而不是反过来计算。这就是我正在做的,但它不起作用。这是我试图将我的大数组的初始化展平以简化它。如果gridSize是内核网格的大小您可以尝试使用预定义的常量gridDim.{x,y,z}。这对解决问题没有帮助,但可能会提高性能。gridSize与cuda部分无关。这是我算法的一个参数。您如何知道rem
始终为0?当您看到问题时,gridSize
的内核启动参数和值是什么?您只使用单个线程运行内核!Thus index永远只为0,您只需从该1个线程中写出索引设置为0时计算的结果……也可以在所有cuda调用(cudaMalloc、cudaMemcpy、内核调用等)上执行此操作