cudaSetDevice()对CUDA设备有什么作用;什么是上下文堆栈?

cudaSetDevice()对CUDA设备有什么作用;什么是上下文堆栈?,cuda,cuda-context,cuda-driver,Cuda,Cuda Context,Cuda Driver,假设我有一个与设备I关联的活动CUDA上下文,现在我调用cudaSetDevice(I)。发生了什么事 什么都没有 主上下文是否替换堆栈的顶部 是否将主上下文推送到堆栈上 事实上,这似乎是不一致的。我编写了这个程序,在一台只有一个设备的机器上运行: #include <cuda.h> #include <cuda_runtime_api.h> #include <cassert> #include <iostream> int main() {

假设我有一个与设备
I
关联的活动CUDA上下文,现在我调用
cudaSetDevice(I)
。发生了什么事

  • 什么都不发生?
  • 主上下文替换堆栈顶部的上下文?
  • 主上下文被压入堆栈?

事实上,行为似乎并不一致。我编写了下面这个程序,并在一台只有一个设备的机器上运行:

    #include <cuda.h>
    #include <cuda_runtime_api.h>
    #include <cassert>
    #include <iostream>
    
    int main()
    {
            CUcontext ctx1, primary;
            cuInit(0);
            auto status = cuCtxCreate(&ctx1, 0, 0);
            assert (status == (CUresult) cudaSuccess);
            cuCtxPushCurrent(ctx1);
            status = cudaSetDevice(0);
            assert (status == cudaSuccess);
            void* ptr1;
            void* ptr2;
            cudaMalloc(&ptr1, 1024);
            assert (status == cudaSuccess);
            cuCtxGetCurrent(&primary);
            assert (status == (CUresult) cudaSuccess);
            assert(primary != ctx1);
            status = cuCtxPushCurrent(ctx1);
            assert (status == (CUresult) cudaSuccess);
            cudaMalloc(&ptr2, 1024);
            assert (status == (CUresult) cudaSuccess);
            cudaSetDevice(0);
            assert (status == (CUresult) cudaSuccess);
            int i = 0;
            while (true) {
                    status = cuCtxPopCurrent(&primary);
                    if (status != (CUresult) cudaSuccess) { break; }
                    std::cout << "Next context on stack (" << i++ << ") is " << (void*) primary << '\n';
            }
    }
    
    这种行为似乎有时是替换(replace)堆栈顶部的上下文,有时是压栈(push)


    TL;DR:根据您提供的代码,在这两种特定用法中,
    cudaSetDevice()
    似乎正在替换堆栈顶部的上下文

    让我们稍微修改一下代码,然后看看我们可以推断出代码中每个API调用对上下文堆栈的影响:

    $ cat t1759.cu
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    #include <cassert>
    #include <iostream>
    void check(int j, CUcontext ctx1, CUcontext ctx2){
      CUcontext ctx0;
      int i = 0;
      while (true) {
                    auto status = cuCtxPopCurrent(&ctx0);
                    if (status != CUDA_SUCCESS) { break; }
                    if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
                    else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
                    else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
      }
    }
    void runtest(int i)
    {
            CUcontext ctx1, primary = NULL;
            cuInit(0);
            auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
            assert (dstatus == CUDA_SUCCESS);
            if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
            assert (dstatus == CUDA_SUCCESS);
            if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
            auto rstatus = cudaSetDevice(0);            // checkpoint 3
            assert (rstatus == cudaSuccess);
            if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
            void* ptr1;
            void* ptr2;
            rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
            assert (rstatus == cudaSuccess);
            if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
            dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
            assert (dstatus == CUDA_SUCCESS);
            assert(primary != ctx1);
            if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
            assert (dstatus == CUDA_SUCCESS);
            if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
            rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
            assert (rstatus == cudaSuccess);
            if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
            rstatus = cudaSetDevice(0);                 // checkpoint 8
            assert (rstatus == cudaSuccess);
            if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
            return;
    }
    
    // Runs the experiment once for each checkpoint 1 through 8, calling
    // cudaDeviceReset() before every run.
    int main(){
        int checkpoint = 1;
        while (checkpoint <= 8) {
            cudaDeviceReset();
            runtest(checkpoint);
            ++checkpoint;
        }
    }
    $ nvcc -o t1759 t1759.cu -lcuda -std=c++11
    $ ./t1759
    1:Next context on stack (0) is ctx1:0x11087e0
    2:Next context on stack (0) is ctx1:0x1741160
    2:Next context on stack (1) is ctx1:0x1741160
    3:Next context on stack (0) is unknown:0x10dc520
    3:Next context on stack (1) is ctx1:0x1c5aa70
    4:Next context on stack (0) is unknown:0x10dc520
    4:Next context on stack (1) is ctx1:0x23eaa00
    5:Next context on stack (0) is ctx2:0x10dc520
    5:Next context on stack (1) is ctx1:0x32caf30
    6:Next context on stack (0) is ctx1:0x3a44ed0
    6:Next context on stack (1) is ctx2:0x10dc520
    6:Next context on stack (2) is ctx1:0x3a44ed0
    7:Next context on stack (0) is ctx1:0x41cfd90
    7:Next context on stack (1) is ctx2:0x10dc520
    7:Next context on stack (2) is ctx1:0x41cfd90
    8:Next context on stack (0) is ctx2:0x10dc520
    8:Next context on stack (1) is ctx2:0x10dc520
    8:Next context on stack (2) is ctx1:0x4959c70
    $
    
    一,

    创建上下文(cuCtxCreate)的同时也会把新创建的上下文压入堆栈,这一点在驱动程序API文档中已有说明

    二,

    毫不奇怪,在堆栈上推送相同的上下文会为其创建另一个堆栈条目

    三,

    cudaSetDevice()
    调用已将堆栈顶部替换为“未知”上下文。(此时未知,因为我们尚未检索“其他”上下文的句柄值)

    四,

    由于此调用,堆栈配置没有差异

    五,

    由于此调用,堆栈配置没有差异,但我们现在知道堆栈顶部上下文是当前上下文(我们可以推测它是主上下文)

    六,

    这里没什么意外。我们把
    ctx1
    压入堆栈,因此堆栈有3个条目:第一个条目是驱动程序API创建的上下文,接下来的两个条目与步骤5中的堆栈配置相同,只是向下移动了一个堆栈位置

    七,

    同样,此调用对堆栈配置没有影响

    八,

    我们再次看到,这里的行为是
    cudaSetDevice()
    调用已将堆栈顶部上下文替换为主上下文

    我从您的测试代码中得出的结论是,当与代码中的各种运行时和驱动程序API调用混合时,
    cudaSetDevice()
    调用的行为没有不一致


    在我看来,这种编程范式是疯狂的。我无法想象为什么你会希望以这种方式混合驱动程序API和运行时API代码。

    哦,不,我不想这样做,这是一个人工示例,我只是想了解其行为,这样当我将API包装扩展到驱动程序API时,我就不会因为错误的假设而把事情搞砸。现在,我有了“推”和“弹出”的代码当前设备-假设仅使用运行时API。既然您已经阐明了
    cudaSetDevice()
    的行为方式,我可以将该代码更改为:1. 查看当前上下文。2. 把它保存起来。3.
    cudaSetDevice()
    到我想通过运行时API使用的设备。4. 做我的工作。5.
    cuCtxSetCurrent()
    到我保存的上下文中。通过编辑我的答案,您做出了一个我不满意的声明,并有效地将该声明归因于我。如果你不编辑我的答案可能会更好(除非我将它们标记为社区维基)。我对此感到不舒服,我建议今后不要对我的答案如此随意。我很乐意对我观察到的东西进行陈述。就这样吧。如果您希望进一步了解,我的建议是通过developer.nvidia.com上的bug归档门户提出此类请求。如果您希望澄清,请使用评论。此外,我不会对您现在在评论中列出的5个步骤进行说明。我将尊重您的建议和要求。
    $ cat t1759.cu
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    #include <cassert>
    #include <iostream>
    void check(int j, CUcontext ctx1, CUcontext ctx2){
      CUcontext ctx0;
      int i = 0;
      while (true) {
                    auto status = cuCtxPopCurrent(&ctx0);
                    if (status != CUDA_SUCCESS) { break; }
                    if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
                    else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
                    else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
      }
    }
    void runtest(int i)
    {
            CUcontext ctx1, primary = NULL;
            cuInit(0);
            auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
            assert (dstatus == CUDA_SUCCESS);
            if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
            assert (dstatus == CUDA_SUCCESS);
            if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
            auto rstatus = cudaSetDevice(0);            // checkpoint 3
            assert (rstatus == cudaSuccess);
            if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
            void* ptr1;
            void* ptr2;
            rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
            assert (rstatus == cudaSuccess);
            if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
            dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
            assert (dstatus == CUDA_SUCCESS);
            assert(primary != ctx1);
            if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
            assert (dstatus == CUDA_SUCCESS);
            if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
            rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
            assert (rstatus == cudaSuccess);
            if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
            rstatus = cudaSetDevice(0);                 // checkpoint 8
            assert (rstatus == cudaSuccess);
            if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
            return;
    }
    
    // Runs the experiment once for each checkpoint 1 through 8, calling
    // cudaDeviceReset() before every run.
    int main(){
        int checkpoint = 1;
        while (checkpoint <= 8) {
            cudaDeviceReset();
            runtest(checkpoint);
            ++checkpoint;
        }
    }
    $ nvcc -o t1759 t1759.cu -lcuda -std=c++11
    $ ./t1759
    1:Next context on stack (0) is ctx1:0x11087e0
    2:Next context on stack (0) is ctx1:0x1741160
    2:Next context on stack (1) is ctx1:0x1741160
    3:Next context on stack (0) is unknown:0x10dc520
    3:Next context on stack (1) is ctx1:0x1c5aa70
    4:Next context on stack (0) is unknown:0x10dc520
    4:Next context on stack (1) is ctx1:0x23eaa00
    5:Next context on stack (0) is ctx2:0x10dc520
    5:Next context on stack (1) is ctx1:0x32caf30
    6:Next context on stack (0) is ctx1:0x3a44ed0
    6:Next context on stack (1) is ctx2:0x10dc520
    6:Next context on stack (2) is ctx1:0x3a44ed0
    7:Next context on stack (0) is ctx1:0x41cfd90
    7:Next context on stack (1) is ctx2:0x10dc520
    7:Next context on stack (2) is ctx1:0x41cfd90
    8:Next context on stack (0) is ctx2:0x10dc520
    8:Next context on stack (1) is ctx2:0x10dc520
    8:Next context on stack (2) is ctx1:0x4959c70
    $
    
            auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
    1:Next context on stack (0) is ctx1:0x11087e0
    
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
    2:Next context on stack (0) is ctx1:0x1741160
    2:Next context on stack (1) is ctx1:0x1741160
    
            auto rstatus = cudaSetDevice(0);            // checkpoint 3
    3:Next context on stack (0) is unknown:0x10dc520
    3:Next context on stack (1) is ctx1:0x1c5aa70
    
            rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
    4:Next context on stack (0) is unknown:0x10dc520
    4:Next context on stack (1) is ctx1:0x23eaa00
    
            dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
    5:Next context on stack (0) is ctx2:0x10dc520
    5:Next context on stack (1) is ctx1:0x32caf30
    
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
    6:Next context on stack (0) is ctx1:0x3a44ed0
    6:Next context on stack (1) is ctx2:0x10dc520
    6:Next context on stack (2) is ctx1:0x3a44ed0
    
            rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
    7:Next context on stack (0) is ctx1:0x41cfd90
    7:Next context on stack (1) is ctx2:0x10dc520
    7:Next context on stack (2) is ctx1:0x41cfd90
    
            rstatus = cudaSetDevice(0);                 // checkpoint 8
    8:Next context on stack (0) is ctx2:0x10dc520
    8:Next context on stack (1) is ctx2:0x10dc520
    8:Next context on stack (2) is ctx1:0x4959c70