cudaSetDevice()对CUDA设备的上下文堆栈有什么作用?
假设我有一个与设备cudaSetDevice()对CUDA设备有什么作用;什么是上下文堆栈?,cuda,cuda-context,cuda-driver,Cuda,Cuda Context,Cuda Driver,假设我有一个与设备I关联的活动CUDA上下文,现在我调用cudaSetDevice(I)。发生了什么事 什么都没有 主上下文是否替换堆栈的顶部 是否将主上下文推送到堆栈上 事实上,这似乎是不一致的。我编写了这个程序,在一台只有一个设备的机器上运行: #include <cuda.h> #include <cuda_runtime_api.h> #include <cassert> #include <iostream> int main() {
I
关联的活动CUDA上下文,现在我调用cudaSetDevice(I)
。发生了什么事
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
int main()
{
CUcontext ctx1, primary;
cuInit(0);
auto status = cuCtxCreate(&ctx1, 0, 0);
assert (status == (CUresult) cudaSuccess);
cuCtxPushCurrent(ctx1);
status = cudaSetDevice(0);
assert (status == cudaSuccess);
void* ptr1;
void* ptr2;
cudaMalloc(&ptr1, 1024);
assert (status == cudaSuccess);
cuCtxGetCurrent(&primary);
assert (status == (CUresult) cudaSuccess);
assert(primary != ctx1);
status = cuCtxPushCurrent(ctx1);
assert (status == (CUresult) cudaSuccess);
cudaMalloc(&ptr2, 1024);
assert (status == (CUresult) cudaSuccess);
cudaSetDevice(0);
assert (status == (CUresult) cudaSuccess);
int i = 0;
while (true) {
status = cuCtxPopCurrent(&primary);
if (status != (CUresult) cudaSuccess) { break; }
std::cout << "Next context on stack (" << i++ << ") is " << (void*) primary << '\n';
}
}
这种行为似乎有时是替换,有时是压栈(推送)
发生了什么事?TL;DR:根据您提供的代码,在这两种特定用法中,
cudaSetDevice()
似乎正在替换堆栈顶部的上下文
让我们稍微修改一下代码,然后看看我们可以推断出代码中每个API调用对上下文堆栈的影响:
$ cat t1759.cu
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
void check(int j, CUcontext ctx1, CUcontext ctx2){
CUcontext ctx0;
int i = 0;
while (true) {
auto status = cuCtxPopCurrent(&ctx0);
if (status != CUDA_SUCCESS) { break; }
if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
}
}
// Replays a fixed sequence of driver-API and runtime-API calls and stops at
// checkpoint `i` (1..8): right after the i-th call of interest it hands the
// context stack to check(), which pops and prints it, and then returns.
// For any other value of `i` the whole sequence runs and nothing is printed.
//
// `primary` stays NULL until checkpoint 5 fills it in via cuCtxGetCurrent(),
// so at checkpoints 1-4 any context other than ctx1 is printed by check()
// as "unknown".
//
// NOTE(review): the result of cuInit() is not checked, and ctx1, ptr1 and
// ptr2 are never released inside this function.
void runtest(int i)
{
CUcontext ctx1, primary = NULL;
cuInit(0);
// Checkpoint 1: create a driver-API context on device 0.
auto dstatus = cuCtxCreate(&ctx1, 0, 0); // checkpoint 1
assert (dstatus == CUDA_SUCCESS);
if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
// Checkpoint 2: push the same context again.
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 2
assert (dstatus == CUDA_SUCCESS);
if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
// Checkpoint 3: first runtime-API call under test.
auto rstatus = cudaSetDevice(0); // checkpoint 3
assert (rstatus == cudaSuccess);
if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
void* ptr1;
void* ptr2;
// Checkpoint 4: a runtime-API allocation after cudaSetDevice().
rstatus = cudaMalloc(&ptr1, 1024); // checkpoint 4
assert (rstatus == cudaSuccess);
if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
// Checkpoint 5: record the current context; it must not be ctx1.
dstatus = cuCtxGetCurrent(&primary); // checkpoint 5
assert (dstatus == CUDA_SUCCESS);
assert(primary != ctx1);
if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
// Checkpoint 6: push ctx1 on top of whatever is current.
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 6
assert (dstatus == CUDA_SUCCESS);
if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
// Checkpoint 7: a runtime-API allocation while ctx1 was just pushed.
rstatus = cudaMalloc(&ptr2, 1024); // checkpoint 7
assert (rstatus == cudaSuccess);
if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
// Checkpoint 8: second runtime-API call under test.
rstatus = cudaSetDevice(0); // checkpoint 8
assert (rstatus == cudaSuccess);
if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
return;
}
// Runs the experiment once per checkpoint, 1 through 8, calling
// cudaDeviceReset() before each run.
int main(){
    const int lastCheckpoint = 8;
    int checkpoint = 1;
    while (checkpoint <= lastCheckpoint) {
        cudaDeviceReset();
        runtest(checkpoint);
        ++checkpoint;
    }
}
$ nvcc -o t1759 t1759.cu -lcuda -std=c++11
$ ./t1759
1:Next context on stack (0) is ctx1:0x11087e0
2:Next context on stack (0) is ctx1:0x1741160
2:Next context on stack (1) is ctx1:0x1741160
3:Next context on stack (0) is unknown:0x10dc520
3:Next context on stack (1) is ctx1:0x1c5aa70
4:Next context on stack (0) is unknown:0x10dc520
4:Next context on stack (1) is ctx1:0x23eaa00
5:Next context on stack (0) is ctx2:0x10dc520
5:Next context on stack (1) is ctx1:0x32caf30
6:Next context on stack (0) is ctx1:0x3a44ed0
6:Next context on stack (1) is ctx2:0x10dc520
6:Next context on stack (2) is ctx1:0x3a44ed0
7:Next context on stack (0) is ctx1:0x41cfd90
7:Next context on stack (1) is ctx2:0x10dc520
7:Next context on stack (2) is ctx1:0x41cfd90
8:Next context on stack (0) is ctx2:0x10dc520
8:Next context on stack (1) is ctx2:0x10dc520
8:Next context on stack (2) is ctx1:0x4959c70
$
一,
上下文创建还会将新创建的上下文推送到堆栈上,如前所述
二,
毫不奇怪,在堆栈上推送相同的上下文会为其创建另一个堆栈条目
三,
cudaSetDevice()
调用已将堆栈顶部替换为“未知”上下文。(此时未知,因为我们尚未检索“其他”上下文的句柄值)
四,
由于此调用,堆栈配置没有差异
五,
由于此调用,堆栈配置没有差异,但我们现在知道堆栈顶部上下文是当前上下文(我们可以推测它是主上下文)
六,
这里没有什么意外。我们把ctx1压入(推送到)堆栈
,因此堆栈有3个条目,第一个条目是驱动程序API创建的上下文,接下来的两个条目与步骤5中的堆栈配置相同,只是向下移动了一个堆栈位置
七,
同样,此调用对堆栈配置没有影响
八,
我们再次看到,这里的行为是cudaSetDevice()
调用已将堆栈顶部上下文替换为主上下文
我从您的测试代码中得出的结论是,当与代码中的各种运行时和驱动程序API调用混合时,cudaSetDevice()
调用的行为没有不一致
在我看来,这种编程范式是疯狂的。我无法想象为什么你会希望以这种方式混合驱动程序API和运行时API代码。哦,不,我不想这样做,这是一个人为构造的示例,我只是想了解其行为,这样当我将API封装扩展到驱动程序API时,我就不会因为错误的假设而把事情搞砸。现在,我有用于“压入”和“弹出”当前设备的代码——假设仅使用运行时API。既然您已经阐明了
cudaSetDevice()
的行为方式,我可以将该代码更改为:1. 查看当前上下文。2. 把它保存起来。3. cudaSetDevice()
到我想通过运行时API使用的设备。4. 做我的工作。5. cuCtxSetCurrent()
到我保存的上下文中。通过编辑我的答案,您做出了一个我不满意的声明,并有效地将该声明归因于我。如果你不编辑我的答案可能会更好(除非我将它们标记为社区维基)。我对此感到不舒服,我建议今后不要对我的答案如此随意。我很乐意对我观察到的东西进行陈述。就这样吧。如果您希望进一步了解,我的建议是通过developer.nvidia.com上的bug归档门户提出此类请求。如果您希望澄清,请使用评论。此外,我不会对您现在在评论中列出的5个步骤进行说明。我将尊重您的建议和要求。
$ cat t1759.cu
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
void check(int j, CUcontext ctx1, CUcontext ctx2){
CUcontext ctx0;
int i = 0;
while (true) {
auto status = cuCtxPopCurrent(&ctx0);
if (status != CUDA_SUCCESS) { break; }
if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
}
}
// Replays a fixed sequence of driver-API and runtime-API calls and stops at
// checkpoint `i` (1..8): right after the i-th call of interest it hands the
// context stack to check(), which pops and prints it, and then returns.
// For any other value of `i` the whole sequence runs and nothing is printed.
//
// `primary` stays NULL until checkpoint 5 fills it in via cuCtxGetCurrent(),
// so at checkpoints 1-4 any context other than ctx1 is printed by check()
// as "unknown".
//
// NOTE(review): the result of cuInit() is not checked, and ctx1, ptr1 and
// ptr2 are never released inside this function.
void runtest(int i)
{
CUcontext ctx1, primary = NULL;
cuInit(0);
// Checkpoint 1: create a driver-API context on device 0.
auto dstatus = cuCtxCreate(&ctx1, 0, 0); // checkpoint 1
assert (dstatus == CUDA_SUCCESS);
if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
// Checkpoint 2: push the same context again.
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 2
assert (dstatus == CUDA_SUCCESS);
if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
// Checkpoint 3: first runtime-API call under test.
auto rstatus = cudaSetDevice(0); // checkpoint 3
assert (rstatus == cudaSuccess);
if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
void* ptr1;
void* ptr2;
// Checkpoint 4: a runtime-API allocation after cudaSetDevice().
rstatus = cudaMalloc(&ptr1, 1024); // checkpoint 4
assert (rstatus == cudaSuccess);
if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
// Checkpoint 5: record the current context; it must not be ctx1.
dstatus = cuCtxGetCurrent(&primary); // checkpoint 5
assert (dstatus == CUDA_SUCCESS);
assert(primary != ctx1);
if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
// Checkpoint 6: push ctx1 on top of whatever is current.
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 6
assert (dstatus == CUDA_SUCCESS);
if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
// Checkpoint 7: a runtime-API allocation while ctx1 was just pushed.
rstatus = cudaMalloc(&ptr2, 1024); // checkpoint 7
assert (rstatus == cudaSuccess);
if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
// Checkpoint 8: second runtime-API call under test.
rstatus = cudaSetDevice(0); // checkpoint 8
assert (rstatus == cudaSuccess);
if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
return;
}
// Runs the experiment once per checkpoint, 1 through 8, calling
// cudaDeviceReset() before each run.
int main(){
    const int lastCheckpoint = 8;
    int checkpoint = 1;
    while (checkpoint <= lastCheckpoint) {
        cudaDeviceReset();
        runtest(checkpoint);
        ++checkpoint;
    }
}
$ nvcc -o t1759 t1759.cu -lcuda -std=c++11
$ ./t1759
1:Next context on stack (0) is ctx1:0x11087e0
2:Next context on stack (0) is ctx1:0x1741160
2:Next context on stack (1) is ctx1:0x1741160
3:Next context on stack (0) is unknown:0x10dc520
3:Next context on stack (1) is ctx1:0x1c5aa70
4:Next context on stack (0) is unknown:0x10dc520
4:Next context on stack (1) is ctx1:0x23eaa00
5:Next context on stack (0) is ctx2:0x10dc520
5:Next context on stack (1) is ctx1:0x32caf30
6:Next context on stack (0) is ctx1:0x3a44ed0
6:Next context on stack (1) is ctx2:0x10dc520
6:Next context on stack (2) is ctx1:0x3a44ed0
7:Next context on stack (0) is ctx1:0x41cfd90
7:Next context on stack (1) is ctx2:0x10dc520
7:Next context on stack (2) is ctx1:0x41cfd90
8:Next context on stack (0) is ctx2:0x10dc520
8:Next context on stack (1) is ctx2:0x10dc520
8:Next context on stack (2) is ctx1:0x4959c70
$
auto dstatus = cuCtxCreate(&ctx1, 0, 0); // checkpoint 1
1:Next context on stack (0) is ctx1:0x11087e0
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 2
2:Next context on stack (0) is ctx1:0x1741160
2:Next context on stack (1) is ctx1:0x1741160
auto rstatus = cudaSetDevice(0); // checkpoint 3
3:Next context on stack (0) is unknown:0x10dc520
3:Next context on stack (1) is ctx1:0x1c5aa70
rstatus = cudaMalloc(&ptr1, 1024); // checkpoint 4
4:Next context on stack (0) is unknown:0x10dc520
4:Next context on stack (1) is ctx1:0x23eaa00
dstatus = cuCtxGetCurrent(&primary); // checkpoint 5
5:Next context on stack (0) is ctx2:0x10dc520
5:Next context on stack (1) is ctx1:0x32caf30
dstatus = cuCtxPushCurrent(ctx1); // checkpoint 6
6:Next context on stack (0) is ctx1:0x3a44ed0
6:Next context on stack (1) is ctx2:0x10dc520
6:Next context on stack (2) is ctx1:0x3a44ed0
rstatus = cudaMalloc(&ptr2, 1024); // checkpoint 7
7:Next context on stack (0) is ctx1:0x41cfd90
7:Next context on stack (1) is ctx2:0x10dc520
7:Next context on stack (2) is ctx1:0x41cfd90
rstatus = cudaSetDevice(0); // checkpoint 8
8:Next context on stack (0) is ctx2:0x10dc520
8:Next context on stack (1) is ctx2:0x10dc520
8:Next context on stack (2) is ctx1:0x4959c70