Visual studio 2013 为什么我的CUDA函数的主机版本不执行？_Visual Studio 2013_Cuda_Binary Search_Thrust

Visual studio 2013 为什么我的CUDA函数的主机版本不执行？

visual-studio-2013 cuda

Visual studio 2013 为什么我的CUDA函数的主机版本不执行？,visual-studio-2013,cuda,binary-search,thrust,Visual Studio 2013,Cuda,Binary Search,Thrust,我写了一个CUDA程序，在1000000个不同的数组上运行1000000个二进制搜索（上限），每个数组有100个元素。为简单起见，所有1000000个阵列已合并/展平为一个包含100000000个元素的大型阵列。请记住，每个二叉搜索树的搜索值都是“固定”的（一个常量填充了最终将使用随机数的位置）我正在尝试对我的Nvidia图形卡与我的CPU相比的加速（或速度损失）进行基准测试。下面的代码描述了我当前的程序，但CPU部分不断出现“0”[秒]的计时，我使用Visual Studio 2013的断点

我写了一个CUDA程序，在1000000个不同的数组上运行1000000个二进制搜索（上限），每个数组有100个元素。为简单起见，所有1000000个阵列已合并/展平为一个包含100000000个元素的大型阵列。请记住，每个二叉搜索树的搜索值都是“固定”的（一个常量填充了最终将使用随机数的位置）

我正在尝试对我的Nvidia图形卡与我的CPU相比的加速（或速度损失）进行基准测试。下面的代码描述了我当前的程序，但CPU部分不断出现“0”[秒]的计时，我使用Visual Studio 2013的断点似乎被忽略。你知道发生了什么事吗？我怀疑我正在使用（或未能使用）的推力功能可能是罪魁祸首

我的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust\random.h>
#include <thrust\generate.h>
#include <thrust\copy.h>
#include <thrust\device_vector.h>
#include <curand_kernel.h>

#include <stdio.h>
#include <time.h>
#include <algorithm>
#include <cstdlib>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

void fillArrayWithRandom(float* inputArray, int inputN)
{
    for (int i = 0; i < inputN; i++)
    {
        inputArray[i] = (float)rand() / float(RAND_MAX);
    }
}

__global__ void warmUp()
{
}

__host__ __device__ int findTarget(float* inputArray, int startingIndex, int endingIndex, float targetValue)
{
    int length = endingIndex - startingIndex;
    if (length > 1)
    {
        int leftSearchIndex = startingIndex + length / 2 + length % 2;
        int rightSearchIndex = endingIndex;
        float leftSearchValue = inputArray[leftSearchIndex];
        float rightSearchValue = inputArray[rightSearchIndex];
        if (leftSearchValue > targetValue)
        {
            return findTarget(inputArray, startingIndex, leftSearchIndex, targetValue);
        }
        else if (rightSearchValue > targetValue)
        {
            return findTarget(inputArray, leftSearchIndex + 1, rightSearchIndex, targetValue);
        }
        else
        {
            return -1;
        }
    }
    else if (inputArray[startingIndex] > targetValue)
    {
        return startingIndex;
    }
    else if (inputArray[endingIndex] > targetValue)
    {
        return endingIndex;
    }
    else
    {
        return -1;
    }
}

__global__ void findTargets(float* inputArray, int numSubElements, int numTrajectories)
{
    int tId = threadIdx.x + (blockIdx.x * blockDim.x);
    while (tId < numTrajectories)
    {
        int beginIndex = tId*numSubElements;
        int endIndex = beginIndex + numSubElements - 1;

        float randomNumber = 0.5; //static for simplicity
        float searchVal = inputArray[endIndex] * randomNumber;

        findTarget(inputArray, beginIndex, endIndex, searchVal);
        tId += blockDim.x * gridDim.x;
    }

}

int main()
{
    //Initiate example data
    int numTrajectories = 1000000;
    int numSubElements = 100;
    int totalNumElements = numSubElements*numTrajectories; // Size of vector
    thrust::host_vector<float> rVec(totalNumElements);
    thrust::host_vector<float> rVec2(totalNumElements);
    fillArrayWithRandom((float *) &rVec[0], totalNumElements);
    fillArrayWithRandom((float *) &rVec2[0], totalNumElements);
    thrust::device_vector<float> d_vec = rVec;
    thrust::device_vector<float> o_vec(totalNumElements);
    thrust::inclusive_scan(d_vec.begin(), d_vec.end(), o_vec.begin());

    //GPU timing
    warmUp <<<1, 1 >>>();
    int threadsPerBlock = 1024;
    float time_spent_GPU, time_spent_CPU;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));
    findTargets <<< (numTrajectories + threadsPerBlock - 1)/threadsPerBlock, threadsPerBlock >>>((float *) thrust::raw_pointer_cast(d_vec.data()), numSubElements, numTrajectories);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time_spent_GPU, start, stop));

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }

    time_spent_GPU = (double)(time_spent_GPU / 1000);
    double avg_GPU = time_spent_GPU / numTrajectories;
    printf("Avg. GPU Simulation Time: %.17g [sim/sec]\n", avg_GPU);

    //CPU Timing
    clock_t begin_CPU, end_CPU;
    begin_CPU = clock();
    float* rightPointer = &rVec2[0];
    for (int i = 0; i < numTrajectories; ++i)
    {
        float randomNumber = 0.5; //static for simplicity
        float searchVal = rightPointer[i*numSubElements + numSubElements - 1] * randomNumber;
        findTarget(rightPointer, i*numSubElements, i*numSubElements + numSubElements -1, searchVal);
    }
    end_CPU = clock();
    time_spent_CPU = (double)(end_CPU - begin_CPU) / CLOCKS_PER_SEC;
    double avg_CPU = time_spent_CPU / numTrajectories;
    printf("Avg. CPU Simulation Time: %.17g [sim/sec]\n", avg_CPU);
    printf("CPU/GPU Timing:%.17gx \n", avg_CPU/avg_GPU);

    return 0;
}

正如Talonmes已经指出的那样，时钟分辨率在ms中。不要使用它。相反，尝试使用boost chrono。直接在cuda中使用它是一个问题，因此请使用以下作为标题（

Timer.h

）：

随着cpp的发展

#include "Timer.h"
#include <boost/chrono.hpp>

// Nanoseconds
boost::chrono::high_resolution_clock::time_point start, finish;

void startTimer()
{
    start = boost::chrono::high_resolution_clock::now();
}

void endTimer()
{
    finish = boost::chrono::high_resolution_clock::now();
}

double totalTime() {
    return (finish - start).count() / (1e9);
}

与

及

与

及

与

这导致

Avg. GPU Simulation Time: 1.7804799301579521e-010 [sim/sec]
Avg. CPU Simulation Time: 6.4100000264286083e-013 [sim/sec]
CPU/GPU Timing:0.0036001529238579829x

（请注意，我目前正在我的GPU上运行其他东西，因此这些数字可能不太相关）

事实证明，编译器实际上足够聪明，能够意识到我没有对主机端的“findTarget”例程执行任何操作，因此它完全将其从编译代码中删除，即它甚至没有被执行（因此解释了为什么大幅增加模拟计数没有任何作用，断点也没有得到尊重）。

我会非常怀疑在宿主向量上使用

raw\u pointer\u cast

。试试像

float*righpointer=&rVec2[0]这样平淡无奇的东西。

谢谢，我刚刚尝试过（也更新了代码）。不幸的是，似乎没有做到这一点。让我感到奇怪的是，我不能在for循环中的任何东西上设置断点。在VS 2012中，它对我来说运行良好。您的生成/调试属性是什么样的？（只是为了显示，这是在调试模式下编译和运行时的输出：平均GPU模拟时间：8.0629449783486962e-009[sim/sec]平均CPU模拟时间：1.2300000662435195e-007[sim/sec]CPU/GPU计时：15.254972836183555x）有意思，谢谢-不确定发生了什么。我刚刚发布了我的编译器选项/输出。此外，我还尝试通过简化替换“推力”来使用Github的最新版本更新我在“C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include”的推力安装文件夹-也许这可能会导致我出现问题？只是尝试用旧的推力替换“推力”目录，但没有效果。此外，尝试删除“-DCUB_CDP”我之前使用CUB库时有过选择。谢谢-现在下载Boost库，完成后会立即报告。结果证明这不是时间问题，而是一个微妙的编译器优化（它完全摆脱了for循环代码，因为它没有在其他地方使用）。不过，感谢您提供的高分辨率计时代码，我一定会开始使用它而不是clock（）。

#include "Timer.h"
#include <boost/chrono.hpp>

// Nanoseconds
boost::chrono::high_resolution_clock::time_point start, finish;

void startTimer()
{
    start = boost::chrono::high_resolution_clock::now();
}

void endTimer()
{
    finish = boost::chrono::high_resolution_clock::now();
}

double totalTime() {
    return (finish - start).count() / (1e9);
}

begin_CPU = clock();

startTimer();

end_CPU = clock();

endTimer();

time_spent_CPU = (double)(end_CPU - begin_CPU) / CLOCKS_PER_SEC;

time_spent_CPU = totalTime();

Avg. GPU Simulation Time: 1.7804799301579521e-010 [sim/sec]
Avg. CPU Simulation Time: 6.4100000264286083e-013 [sim/sec]
CPU/GPU Timing:0.0036001529238579829x