Cuda C线程与printf或其他函数同步_Cuda_Printf

Cuda C线程与printf或其他函数同步

cuda

Cuda C线程与printf或其他函数同步,cuda,printf,Cuda,Printf,在块执行期间，线程的id有问题。我想有这样的句子：“我的临时字符串是通过GPU打印的！”正如你所看到的（在附上的照片上），这个句子显示错误，我不知道如何修复它代码： \uuuuu全局\uuuuuu无效打印（常量字符*常量字符串，常量大小*常量循环\u重复） { int id_x=threadIdx.x+blockIdx.x*blockDim.x；而（id_x（字符串\u GPU，我的字符串\u长度）； cudaError\u t final\u error=cudaDeviceSynch

在线程块执行期间，线程 id 的顺序有问题。我希望得到这样的句子：“My temporary string is printed via GPU!”（我的临时字符串是通过 GPU 打印的！）。正如你所看到的（见附图），这个句子显示得不对，我不知道如何修复。

代码：

// Prints the first *loop_repeat characters of __string, one character per
// thread per loop iteration, using device-side printf.
//
// Parameters:
//   __string     device pointer to the characters to print
//   loop_repeat  device pointer to the number of characters to print
//
// NOTE: the order in which the characters appear is NOT guaranteed. Threads
// are scheduled in warps of 32 and CUDA does not define the execution order
// of warps; the __syncthreads() below does not impose any ordering on the
// printed output.
// NOTE(review): __syncthreads() sits inside a loop whose trip count depends
// on id_x, so every thread of a block must execute the same number of
// iterations (e.g. a single block of exactly *loop_repeat threads).
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}
// Copies a host string and its length to the GPU, launches Print with one
// block containing one thread per character, then reports the result of
// cudaDeviceSynchronize().
//
// NOTE(review): Get_String_Length, HostToDevice and NEW_LINE are not defined
// in this snippet; they are presumably a strlen-like helper, an alias for
// cudaMemcpyHostToDevice and a char constant holding '\n' -- confirm against
// the full program.
int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp{};
    temp = Get_String_Length(my_string); //get the string length
    //GPU MEMORY ALLOCATION
    size_t* my_string_length{};
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), HostToDevice);
    char* string_GPU{};
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), HostToDevice);
    dim3 grid_size(1);
    dim3 block_size((temp));
    // One block, one thread per character.
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    // Wait for the kernel to finish so that the device-side printf output is
    // flushed before the host prints its status line.
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
    if (final_error == cudaSuccess)
    {
        // The original format string ended in "!%\n"; the lone '%' is an
        // invalid conversion specification and has been dropped.
        printf("%cKernel executed successfully with code: %d !\n", NEW_LINE, final_error);
    }
    else
    {
        printf("%cKernel executed with code error: %d !\n", NEW_LINE, final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}

我将非常感谢您提供的任何帮助。

发生这种情况是因为多处理器以 32 个并行线程为一组（称为 warp）来创建、管理、调度和执行线程（参见 CUDA 编程指南中关于 SIMT 架构的说明），因此前 32 个线程对应 “My temporary string is printed v”，其余线程对应 “ia GPU!”。内核在执行时似乎把后一个 warp 排在了第一个 warp 之前。
这里的主要问题是，您期望线程或 warp 的执行顺序是可预测的，而事实并非如此。您使用的 `__syncthreads()` 无法修复或解决这个问题。
如果希望 warp 以可预测的顺序执行（不推荐），则需要自己强制实现该顺序。下面的示例针对这段非常简单的代码演示了具体做法。若不加修改，它无法扩展到更长的字符串；而且一旦引入多个线程块（threadblock），这种方法就会完全失效。

$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x>>5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; //get the string length
    //GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);
    char* string_GPU;
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), cudaMemcpyHostToDevice);
    dim3 grid_size(1);
    dim3 block_size((temp));
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !%\n", final_error);
    }
    else
    {
        printf("\nKernel executed with code error: %d !\n", final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$

$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x>>5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; //get the string length
    //GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);
    char* string_GPU;
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), cudaMemcpyHostToDevice);
    dim3 grid_size(1);
    dim3 block_size((temp));
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !%\n", final_error);
    }
    else
    {
        printf("\nKernel executed with code error: %d !\n", final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$

注意，我并不是说上面的代码风格很好；提供它只是为了帮助理解这个问题。即使是这段代码，也依赖于这样一个假设：同一个 warp 中的线程会以可预测的顺序调用 `printf` 函数，而 CUDA 编程模型并不保证这一点。因此，严格来说这段代码仍然是有问题的。
请贴出完整的代码，这样才能帮到您。
我现在已经贴出来了。欢迎！
您使用了哪些库？
stdio.h、stdlib.h 以及 CUDA 内置的基本功能。
还有许多其他未定义的内容，例如 `Get_String_Length` 和 `HostToDevice`。
谢谢 Robert！我知道线程是以不可预测的顺序执行的，但我想知道是否有可能得到我想要的线程执行顺序。此致问候！
谢谢！我非常感激！
$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x>>5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; //get the string length
    //GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);
    char* string_GPU;
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), cudaMemcpyHostToDevice);
    dim3 grid_size(1);
    dim3 block_size((temp));
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !%\n", final_error);
    }
    else
    {
        printf("\nKernel executed with code error: %d !\n", final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$