Cuda cub库支持的最大大小
有人知道cub::scan支持的最大大小是多少吗?我得到了输入大小超过5亿的内核转储。我想确定我没有做错什么 这是我的密码:Cuda cub库支持的最大大小,cuda,nvidia,cub,prefix-sum,Cuda,Nvidia,Cub,Prefix Sum,有人知道cub::scan支持的最大大小是多少吗?我得到了输入大小超过5亿的内核转储。我想确定我没有做错什么 这是我的密码: #define CUB_STDERR #include <stdio.h> #include "cub/util_allocator.cuh" #include "cub/device/device_scan.cuh" #include <sys/time.h> using namespace cub; bool
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
#定义立方体标准
#包括
#包括“cub/util_allocator.cuh”
#包括“cub/device/device_scan.cuh”
#包括
使用名称空间cub;
bool g_verbose=false;//是否显示控制台的输入/输出
CachingDeviceAllocator g_分配器(true);//设备内存的缓存分配器
typedef int-mytype;
/**
*解决包容性扫描问题
*/
静态void solve(mytype*h_-in,mytype*h_-cpu,int-n)
{
mytype=0;
对于(int i=0;i
问题在于:
const int size = num_items * sizeof(mytype);
可以通过将其更改为:
const size_t size = num_items * sizeof(mytype);
代码中num\u项的值超过10亿。当我们将它乘以(mytype)的大小时,我们将它乘以4,因此结果超过40亿。此值不能存储在int
变量中。如果您尝试这样使用它,那么后续的主机代码将做坏事。这个问题(核心转储)实际上与CUDA无关。如果删除了所有CUB元素,代码将进行核心转储
当我修改上面的代码行,并针对正确的GPU进行编译(例如,在我的情况下是-arch=sm_35
,或者对于Titan X GPU是-arch=sm_52
),然后我得到了正确的答案(并且没有seg故障/堆芯转储)
一般来说,追踪seg故障/核心转储类型错误时,正确的起点是识别此错误是由主机代码引起的,您应该尝试定位生成此错误的源代码的确切行。这可以通过在代码中放入许多printf
语句来完成,直到您确定代码行之后没有看到任何printf输出,或者通过使用主机代码调试器(如linux上的gdb)来完成
还要注意,编写的这段代码在主机上需要略多于12GB的内存,在GPU上需要略多于8GB的内存,因此它只能在这种设置下正常运行
以下是固定代码(基于OP发布的内容),仅供参考:
#定义立方体标准
#包括
#包括“cub/util_allocator.cuh”
#包括“cub/device/device_scan.cuh”
#包括
使用名称空间cub;
bool g_verbose=false;//是否显示控制台的输入/输出
CachingDeviceAllocator g_分配器(true);//设备内存的缓存分配器
typedef int-mytype;
/**
*解决包容性扫描问题
*/
静态void solve(mytype*h_-in,mytype*h_-cpu,int-n)
{
mytype=0;
对于(int i=0;i#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}