Sorting 设备上的按键推力排序抛出错误_Sorting_Cuda_Thrust

Sorting 设备上的按键推力排序抛出错误

sorting cuda

Sorting 设备上的按键推力排序抛出错误,sorting,cuda,thrust,Sorting,Cuda,Thrust,我有一个设备浮点数组，我尝试使用这个函数按键对其进行排序： #include <thrust/sort.h> #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/device_ptr.h> #include <thrust/device_malloc.h> #include <thrust/device_free.h&

我有一个设备浮点数组，我尝试使用这个函数按键对其进行排序：

#include <thrust/sort.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>    

template <typename T>
__host__ T* deepCopyDeviceArray(T* dev_array, int arraysize)
{
// performs a deep copy of a device array and returns the copy's device pointer

cudaError_t cudaStatus;

T* dev_copiedArray;

cudaStatus = cudaMalloc((void**)&dev_copiedArray, (arraysize * sizeof(T)));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMalloc failed!");
}

cudaStatus = cudaMemcpy(dev_copiedArray, dev_array, (arraysize * sizeof(T)), cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMemcpy failed!");
}

return dev_copiedArray;
}



template <typename T>
int* sortByKeyOnDevice(T* dev_keys, int len, const int* valuesarray)
{
// sorts keysarray and returns the sorted indices
T* dev_keys2 = deepCopyDeviceArray(dev_keys, len); // make deep copy to evade change of original keys

// make deep copy of values and copy it to device
int* dev_values;
cudaMalloc((void **) &dev_values, len);
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);

// create device pointers
thrust::device_ptr<T> dev_ptr_keys = thrust::device_pointer_cast(dev_keys2);
thrust::device_ptr<int> dev_ptr_values = thrust::device_pointer_cast(dev_values);

thrust::sort_by_key(dev_ptr_keys, dev_ptr_keys + len, dev_ptr_values);

//thrust::device_free(dev_ptr_keys);
cudaFree(dev_keys2);
return dev_values; // return only indices of sorted array
}

int main()
{
int len = 10;
float* array1 = new float[len]; for (int i=0;i<len;i++) array1[i] = rand();

float* dev_array1;
cudaMalloc(&dev_array1, len * sizeof(float));
cudaMemcpy(dev_array1, array1, (len * sizeof(float)), cudaMemcpyHostToDevice);

int* valuesarray = new int[len]; for (int i=0; i<len; i++) valuesarray[i] = i;
int* dev_values;

dev_values = sortByKeyOnDevice(dev_array1, len, valuesarray);

int* values = new int[len];
cudaMemcpy(values, dev_values, (len * sizeof(int)), cudaMemcpyDeviceToHost); // or use dev_values in a kernel for further calculations
}

#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
模板
__主机uuu*deepCopyDeviceArray（T*dev\u阵列，int阵列大小）
{
//执行设备数组的深度复制并返回副本的设备指针
cudaError\u t cudaStatus；
T*开发副本阵列；
cudaStatus=cudamaloc（（void**）和dev_copiedaray（arraysize*sizeof（T））；
if（cudaStatus！=cudaSuccess）{
fprintf（stderr，“深度复制cudaMalloc失败！”）；
}
cudaStatus=cudaMemcpy（dev_copiedArray，dev_array，（arraysize*sizeof（T）），cudaMemcpyDeviceToDevice）；
if（cudaStatus！=cudaSuccess）{
fprintf（stderr，“深度复制cudaMemcpy失败！”）；
}
返回dev_copiedaray；
}
模板
int*sortByKeyOnDevice（T*开发密钥、int len、const int*值数组）
{
//对keysarray排序并返回排序后的索引
T*dev_keys2=deepCopyDeviceArray（dev_keys，len）；//进行深度复制以避免更改原始密钥
//对值进行深度复制并将其复制到设备
int*dev_值；
cudamaloc（（void**）和dev_值，len）；
cudaMemcpy（dev_值、valuesarray、len*sizeof（int）、cudamemcpyhostodevice）；
//创建设备指针
推力：：设备\u ptr开发\u ptr\u键=推力：：设备\u指针\u转换（开发键2）；
推力：：设备\u ptr开发\u ptr\u值=推力：：设备\u指针\u转换（开发值）；
推力：按键排序（dev_ptr_键、dev_ptr_键+len、dev_ptr_值）；
//推力：无设备（dev_ptr_键）；
cudaFree（开发键2）；
return dev_value；//只返回排序数组的索引
}
int main（）
{
int len=10；
float*array1=new float[len]；for（int i=0；i每当您在使用CUDA代码时遇到问题，您应该在每个CUDA API调用和内核调用上设置到位（对于推力调用，您不需要这样做，它们有自己的错误报告机制）。您还可以使用cuda memcheck
运行代码，即使未明确检查API错误，也会显示API错误
如果您这样做了，您会发现这行代码正在报告API错误（无效参数）：
查看前一行，您的尺寸参数不正确：
cudaMalloc((void **) &dev_values, len);

应该是：
cudaMalloc((void **) &dev_values, len*sizeof(int));

通过这一更改，您的代码为我编译并运行时不会出现任何错误
另外，在发布代码时，请正确设置格式（缩进）让其他人更容易阅读。
什么是deepCopyDeviceArray
？发布完整的代码。#包括语句、数据生成、复制、粘贴、编译和运行所需的所有内容。因此希望如此。投票结束。很抱歉…此代码现在应使用CUDA 5.5.6编译
cudaMalloc((void **) &dev_values, len*sizeof(int));