Cuda 在CMake中确定nvcc需要哪些gencode（compute、arch）值_Cuda_Cmake_Build Automation_Detection

Cuda 在CMake中确定nvcc需要哪些gencode（compute、arch）值

cuda cmake

Cuda 在CMake中确定nvcc需要哪些gencode（计算、拱度）值,cuda,cmake,build-automation,detection,Cuda,Cmake,Build Automation,Detection,我正在使用CMake作为我的代码的构建系统，它涉及CUDA。我正在考虑自动化任务，以决定需要将哪个compute\u XX和arch\u XX传递给我的nvcc，以便为我当前机器上的GPU进行编译有没有办法做到这一点：使用英伟达GPU部署工具包没有英伟达GPU部署工具包？ CMake的FindCUDA是否有助于您确定这些开关的值我的策略是编译并运行一个bash脚本，该脚本探测卡并返回cmake的gencode。灵感来自。要处理错误或多个GPU或其他情况，请根据需要进行修改在项目文件

我正在使用CMake作为我的代码的构建系统，它涉及CUDA。我正在考虑自动化任务，以决定需要将哪个

compute_XX

和

arch_XX

传递给我的nvcc，以便为我当前机器上的GPU进行编译

有没有办法做到这一点：
- 使用英伟达GPU部署工具包（GPU Deployment Kit）？
- 不使用英伟达GPU部署工具包？
CMake的
```
FindCUDA
```
是否有助于您确定这些开关的值

我的策略是编译并运行一个bash脚本，该脚本探测卡并返回cmake的gencode。灵感来自。要处理错误或多个GPU或其他情况，请根据需要进行修改

在项目文件夹中创建一个文件cudaComputeVersion.bash，并确保该文件可从shell执行。将以下内容放入此文件：

#!/bin/bash
#
# Prints the nvcc "-gencode" flag matching the compute capability of CUDA
# device 0, e.g. "-gencode arch=compute_61,code=sm_61".
#
# Environment:
#   CUDA_NVCC_EXECUTABLE  optional path to nvcc
#                         (default: /usr/local/cuda/bin/nvcc)
#   TMPDIR                optional scratch location (default: /tmp)
#
# Exit status: non-zero if the scratch directory cannot be created, the probe
# fails to compile, or the CUDA runtime cannot query device 0.

nvcc_binary=${CUDA_NVCC_EXECUTABLE:-/usr/local/cuda/bin/nvcc}

# Use a private scratch directory instead of fixed names in /tmp, so that
# concurrent runs (or other users) cannot clobber each other's files.
work_dir=$(mktemp -d "${TMPDIR:-/tmp}/cudaComputeVersion.XXXXXX") || exit 1
# Clean up on every exit path, including failures.
trap 'rm -rf "$work_dir"' EXIT

# create a 'here document' that is code we compile and use to probe the card
# (the delimiter is quoted so the shell leaves the C source untouched)
cat << 'EOF' > "$work_dir/cudaComputeVersion.cu"
#include <stdio.h>
#include <cuda_runtime.h>
int main()
{
    cudaDeviceProp prop;
    cudaError_t status = cudaGetDeviceProperties(&prop, 0);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties() failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    int v = prop.major * 10 + prop.minor;
    printf("-gencode arch=compute_%d,code=sm_%d\n", v, v);
    return 0;
}
EOF

# compile the probe; stop here if nvcc is missing or the build fails
"$nvcc_binary" "$work_dir/cudaComputeVersion.cu" -o "$work_dir/cudaComputeVersion" || exit 1

# probe the card; the script's exit status is the probe's exit status
"$work_dir/cudaComputeVersion"

cheers

与@orthopteroid的答案相比有了一点改进，它几乎确保生成一个唯一的临时文件，并且只需要一个而不是两个临时文件

将以下内容放入 scripts/get_cuda_sm.sh：

#!/bin/bash
#
# Prints the compute capability of the first CUDA device installed
# on the system, or alternatively the device whose index is the
# first command-line argument. The value is printed in XY form
# (major * 10 + minor), followed by a newline.
#
# Usage: get_cuda_sm.sh [device_index]
#
# Environment (all optional):
#   CMAKE_CXX_COMPILER   C++ compiler used to build the probe
#                        (default: the g++ found on PATH)
#   CUDA_DIR             CUDA installation root (default: /usr/local/cuda)
#   CUDA_INCLUDE_DIRS    CUDA include directory (default: $CUDA_DIR/include)
#   CUDA_CUDART_LIBRARY  CUDA runtime library
#                        (default: $CUDA_DIR/lib64/libcudart.so)
#   TMPDIR               scratch location (default: /tmp)
#
# Exit status: non-zero if the index is not a number, the probe cannot be
# built, or the CUDA runtime reports an error; otherwise the probe's status.

device_index=${1:-0}
# The index is spliced into the C source below, so insist on a plain
# non-negative integer rather than letting arbitrary text reach the compiler.
case "$device_index" in
    *[!0-9]*)
        echo "Device index must be a non-negative integer, got '$device_index'" >&2
        exit 1
        ;;
esac

# Prefer the compiler exported by the calling CMake project, then g++ on PATH.
cxx_binary=${CMAKE_CXX_COMPILER:-$(command -v g++)}
cxx_binary=${cxx_binary:-g++}
cuda_root=${CUDA_DIR:-/usr/local/cuda}
CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}

# Private scratch directory: unique by construction, removed on any exit path.
work_dir=$(mktemp -d "${TMPDIR:-/tmp}/cuda-compute-version-helper.XXXXXX") || exit 1
trap 'rm -rf "$work_dir"' EXIT
generated_binary="$work_dir/cuda-compute-version-helper"

# create a 'here document' that is code we compile and use to probe the card
# (unquoted delimiter: ${device_index} is expanded by the shell, and "\\n"
# reaches the compiler as "\n")
source_code="$(cat << EOF
#include <stdio.h>
#include <cuda_runtime_api.h>

int main()
{
        cudaDeviceProp prop;
        cudaError_t status;
        int device_count;
        status = cudaGetDeviceCount(&device_count);
        if (status != cudaSuccess) {
                fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status));
                return -1;
        }
        if (${device_index} >= device_count) {
                fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
                return -1;
        }
        status = cudaGetDeviceProperties(&prop, ${device_index});
        if (status != cudaSuccess) {
                fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status));
                return -1;
        }
        int v = prop.major * 10 + prop.minor;
        printf("%d\\n", v);
        return 0;
}
EOF
)"

# Compile the probe from stdin; "-x none" restores file-type detection so the
# runtime library that follows is treated as a library, not as C++ source.
if ! printf '%s\n' "$source_code" | "$cxx_binary" -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - -x none "$CUDA_CUDART_LIBRARY"; then
        echo "Failed to build the CUDA compute capability probe with '$cxx_binary'" >&2
        exit 1
fi

# probe the card; cleanup happens in the EXIT trap, so the script's exit
# status is the probe's exit status
"$generated_binary"

在使用 CMake 3.7 或更新版本时，您可以使用 FindCUDA 模块中的 cuda_select_nvcc_arch_flags 宏，而无需任何其他脚本：

# Select nvcc -gencode flags with FindCUDA's cuda_select_nvcc_arch_flags()
# and append them to CUDA_NVCC_FLAGS.
#
# CUDA_ARCH_LIST (cache, user-configurable) chooses what to build for:
# architecture names, compute capability versions, or "Auto" to detect the
# GPUs of the build machine.
#
# Load the module through find_package() so a missing toolkit is reported
# right here instead of surfacing later as an obscure nvcc failure.
# NOTE(review): FindCUDA is deprecated in newer CMake releases - confirm it is
# still available for the CMake versions this project supports.
find_package(CUDA REQUIRED)

# The macro only exists in the FindCUDA shipped with CMake 3.7 and later.
if(NOT COMMAND cuda_select_nvcc_arch_flags)
    message(FATAL_ERROR
        "cuda_select_nvcc_arch_flags() is not available; CMake 3.7 or newer is required.")
endif()

set(CUDA_ARCH_LIST Auto CACHE STRING
    "List of CUDA architectures (e.g. Pascal, Volta, etc) or \
compute capability versions (6.1, 7.0, etc) to generate code for. \
Set to Auto for automatic detection (default)."
)
# Deliberately unquoted: each list entry must arrive as a separate argument.
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})

例如，上面的代码在我的机器上会将 CUDA_ARCH_FLAGS 设置为 -gencode arch=compute_61,code=sm_61。
用户可以配置 CUDA_ARCH_LIST 缓存变量，以便为特定的计算能力生成代码，而不是自动检测。
注意：自 CMake 3.10 以来，FindCUDA 模块已被弃用。但是，最新的 CMake 版本（v3.14）中似乎还没有提供与 cuda_select_nvcc_arch_flags() 宏等效的替代方法。有关更多详细信息，请参见 CMake 问题跟踪程序。
我不清楚您的具体设想。是否希望 CMake 检测构建系统中的所有 NVIDIA GPU 并查询每个 GPU 的计算能力（例如，通过调用 nvidia-smi），然后根据结果生成 -arch
标志列表？特别是在集群中，生成系统可能包含与启用GPU的集群节点完全不同的GPU，甚至根本没有GPU。您可以编写脚本来确定您的arch，然后使用必要的选项运行生成，或者您可以将所有可能的arch变体传递给编译器，在这种情况下，您的gpu所需的代码将被确定并加载到runtime@njuffa：嗯，默认情况下，您是为正在构建的系统构建的，所以-我想，是的。即使它只是查看一些它希望在其中看到这些信息的环境变量，这也将是一件了不起的事情。@vlad1slav：我知道我可以做到，但那将是重新发明轮子。这样的脚本不是已经存在了吗？它不应该是FindCUDA的一部分吗？很好。。。我下周试试看。目前为+1。然而。。。我需要让bash脚本使用FindCUDA获得的路径nvcc
@einpoklum Ya，我知道我没有涉及到这一点。可能会将arg传递给脚本。。。但我还没有尝试过。所以，我已经尝试了一段时间，并提出了一个（希望）更好的版本，您可能也会喜欢，请看我的答案。@Orthoteroid:如果您喜欢，您可以：1。使用它，看看是否有错误/角落案例我错过了和2。Upvote…@orthopteroid：另外，我认为它可以像您一样将源代码管道化到GCC二进制文件；我只是习惯了一种单一的赋值方式。我尝试过使用它，但从cmake得到了这个错误：bash:-c:line 1:意外标记“|”附近的语法错误bash:-c:line 1:`sed's/^\（[0-9]\）\（[0-9]\）/\1.\2/；”xargs echo-n'构建计算能力61
@tylerjw:可能您有空的${CUDA\u INCLUDE\u DIRS}或${CUDA\u CUDART\u LIBRARY}。你需要“找到库达”。如果没有-您需要更仔细地检查失败的地方/原因。但不管怎样，请看编辑。@tylerjw：你的意思是我刚刚在编辑中做了些改变？如果是，谢谢。我想引用不会有什么坏处。在CMake 3.8中，FindCUDA的使用不是被弃用了吗？确实如此。不过，正如您已经注意到的，似乎还没有任何相关的替代品。感谢您在CMake问题跟踪程序中创建相关问题！我在答案中加入了一个链接。
# Determine the compute capability that CUDA device code is compiled for and
# append the matching -gencode option to CUDA_NVCC_FLAGS.
#
# Sources, in order of precedence:
#   1. CUDA_TARGET_COMPUTE_CAPABILITY, if already set (e.g. with -D or from
#      the cache)
#   2. the CUDA_SM environment variable
#   3. the output of scripts/get_cuda_sm.sh, which probes a CUDA device
#
# The value uses the XY format (e.g. 61 for capability 6.1).
if(NOT CUDA_TARGET_COMPUTE_CAPABILITY)
    if("$ENV{CUDA_SM}" STREQUAL "")
        # Hand the probe script the CUDA paths and compiler known to CMake.
        set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
        set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
        set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
        # Pass the script as an argument to bash (not through "bash -c") so
        # that source paths containing spaces are not re-split by the shell.
        execute_process(
            COMMAND bash "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh"
            OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_
            OUTPUT_STRIP_TRAILING_WHITESPACE)
    else()
        set(CUDA_TARGET_COMPUTE_CAPABILITY_ "$ENV{CUDA_SM}")
    endif()
    string(STRIP "${CUDA_TARGET_COMPUTE_CAPABILITY_}" CUDA_TARGET_COMPUTE_CAPABILITY_)

    # Fail now rather than hand nvcc a malformed "arch=compute_" option when
    # the probe printed nothing or CUDA_SM holds something unexpected.
    if(NOT CUDA_TARGET_COMPUTE_CAPABILITY_ MATCHES "^[0-9][0-9]+[a-z]?$")
        message(FATAL_ERROR
            "Could not determine the CUDA compute capability (got "
            "\"${CUDA_TARGET_COMPUTE_CAPABILITY_}\"). Set "
            "CUDA_TARGET_COMPUTE_CAPABILITY or the CUDA_SM environment "
            "variable to a value in XY format, e.g. 61.")
    endif()

    set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}"
        CACHE STRING "CUDA compute capability of the (first) CUDA device on \
the system, in XY format (like the X.Y format but no dot); see table \
of features and capabilities by capability X.Y value at \
https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")
    # A pre-existing (empty) cache entry is not overwritten by the call above,
    # so also set the normal variable to make the detected value take effect.
    set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}")

    # XY -> X.Y for display; the last digit is the minor version, so
    # three-digit values format correctly too (100 -> 10.0).
    string(REGEX REPLACE "^([0-9]+)([0-9][a-z]?)$" "\\1.\\2"
        FORMATTED_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY}")

    message(STATUS
        "CUDA device-side code will assume compute capability \
${FORMATTED_COMPUTE_CAPABILITY}")
endif()

# code=compute_XX embeds PTX for the selected capability.
# NOTE(review): use code=sm_XX here if native device binaries are wanted.
# There is no space after the comma: the value is handed to nvcc as one argument.
set(CUDA_GENCODE
    "arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY},code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
list(APPEND CUDA_NVCC_FLAGS -gencode "${CUDA_GENCODE}")

# Select nvcc -gencode flags with FindCUDA's cuda_select_nvcc_arch_flags()
# and append them to CUDA_NVCC_FLAGS.
#
# CUDA_ARCH_LIST (cache, user-configurable) chooses what to build for:
# architecture names, compute capability versions, or "Auto" to detect the
# GPUs of the build machine.
#
# Load the module through find_package() so a missing toolkit is reported
# right here instead of surfacing later as an obscure nvcc failure.
# NOTE(review): FindCUDA is deprecated in newer CMake releases - confirm it is
# still available for the CMake versions this project supports.
find_package(CUDA REQUIRED)

# The macro only exists in the FindCUDA shipped with CMake 3.7 and later.
if(NOT COMMAND cuda_select_nvcc_arch_flags)
    message(FATAL_ERROR
        "cuda_select_nvcc_arch_flags() is not available; CMake 3.7 or newer is required.")
endif()

set(CUDA_ARCH_LIST Auto CACHE STRING
    "List of CUDA architectures (e.g. Pascal, Volta, etc) or \
compute capability versions (6.1, 7.0, etc) to generate code for. \
Set to Auto for automatic detection (default)."
)
# Deliberately unquoted: each list entry must arrive as a separate argument.
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})