opencl设备中的首选矢量宽度
我是OpenCL的初学者,正在尝试运行《OpenCL in Action》一书中的示例代码。我用下面的代码来获取设备的首选向量宽度。在我的计算机上检测到的平台一个来自Intel(Core i7和HD Graphics),另一个来自NVIDIA(GeForce 940M)。每次运行代码时,它对每种类型给出的向量宽度都是“1”,只有double类型为0,因为该类型不受支持。即使我切换到计算机上的另一个平台来检查其设备,结果也是一样的。我在一台AMD电脑上运行了这段代码,它似乎工作正常,因为它对不同的类型给出了不同的数值。但是,我不知道为什么这段代码在我的计算机的不同平台上对每种类型都给出“1”。有什么想法吗? 以下是输出: 代码如下:opencl设备中的首选矢量宽度,opencl,Opencl,我是OpenCL的初学者,正在尝试运行《OpenCL in Action》一书中的示例代码。我用下面的代码来获取设备的首选向量宽度。在我的计算机上检测到的平台一个来自Intel(Core i7和HD Graphics),另一个来自NVIDIA(GeForce 940M)。每次运行代码时,它对每种类型给出的向量宽度都是“1”,只有double类型为0,因为该类型不受支持。即使我切换到计算机上的另一个平台来检查其设备,结果也是一样的。我在一台AMD电脑上运行了这段代码,它似乎工作正常,因为它对不同的类型给出了不同的数值。但是,我不知道为什么
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices, vector_width;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
if (strstr(devi_name_data, "GeForce 940M")){
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in chars: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in shorts: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in ints: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in longs: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in floats: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in doubles: %u\n", vector_width);
}
}
}
return 0;
}
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"
#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err == CL_DEVICE_NOT_FOUND) {
printf(" GPU is not first! Going on CPU :(");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
/*
 * Create an OpenCL program from a source file and compile it.
 *
 * ctx      - context the program is created in
 * dev      - device whose build log is printed if compilation fails
 * filename - path of the OpenCL C source file
 *
 * Returns the built program. On any failure (file access, allocation,
 * program creation, build) a message is printed and the process exits
 * with status 1; a failed build also prints the build log to stdout.
 */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    long file_length;
    cl_int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    if (fseek(program_handle, 0, SEEK_END) != 0 ||
        (file_length = ftell(program_handle)) < 0) {
        perror("Couldn't determine the program file size");
        exit(1);
    }
    rewind(program_handle);

    program_buffer = (char*)malloc((size_t)file_length + 1);
    if (program_buffer == NULL) {
        perror("Couldn't allocate the program buffer");
        exit(1);
    }
    /*
     * Use the number of bytes actually read as the source length: in text
     * mode it can be smaller than the ftell() size (newline translation),
     * and the unread tail of the buffer would otherwise be passed along
     * uninitialised.
     */
    program_size = fread(program_buffer, sizeof(char), (size_t)file_length,
                         program_handle);
    if (ferror(program_handle)) {
        perror("Couldn't read the program file");
        exit(1);
    }
    program_buffer[program_size] = '\0';
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", (int)err);
        exit(1);
    }
    free(program_buffer);

    /* Build program for every device associated with the program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        /* Find size of log and print to std output */
        log_size = 0;
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        if (program_log == NULL) {
            perror("Couldn't allocate the build log");
            exit(1);
        }
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }

    return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel[NUM_KERNELS];
cl_command_queue queue;
cl_event prof_event;
cl_int i, j, err, preferred_width;
size_t local_size, global_size;
char kernel_names[NUM_KERNELS][20] =
{ "reduction_scalar", "reduction_vector" };
/* Data and buffers */
float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
//float data[ARRAY_SIZE];
float sum, actual_sum, *scalar_sum, *vector_sum;
cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
cl_int num_groups;
cl_ulong time_start, time_end, total_time;
/* Initialize data */
for (i = 0; i<ARRAY_SIZE; i++) {
data[i] = 1.0f*i;
}
/* Create device and determine local size */
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(preferred_width), &preferred_width, NULL);
printf("Preferred vector width: %d\n", preferred_width);
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(local_size), &local_size, NULL);
if (err < 0) {
perror("Couldn't obtain device information");
exit(1);
}
/* Allocate and initialize output arrays */
num_groups = ARRAY_SIZE / local_size;
scalar_sum = (float*)malloc(num_groups * sizeof(float));
vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
for (i = 0; i<num_groups; i++) {
scalar_sum[i] = 0.0f;
}
for (i = 0; i<num_groups / 4; i++) {
vector_sum[i] = 0.0f;
}
/* Create a context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
if (err < 0) {
perror("Couldn't create a buffer");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
for (i = 0; i<NUM_KERNELS; i++) {
/* Create a kernel */
kernel[i] = clCreateKernel(program, kernel_names[i], &err);
if (err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create kernel arguments */
err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
if (i == 0) {
global_size = ARRAY_SIZE;
err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
}
else {
global_size = ARRAY_SIZE / 4;
err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
}
if (err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Finish processing the queue and get profiling information */
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
/* Read the result */
if (i == 0) {
err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups; j++) {
sum += scalar_sum[j];
}
}
else {
err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups / 4; j++) {
sum += vector_sum[j];
}
}
/* Check result */
printf("%s: ", kernel_names[i]);
actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
if (fabs(sum - actual_sum) > 0.01*fabs(sum))
printf("Check failed.\n");
else
printf("Check passed.\n");
printf("Total time = %lu\n\n", total_time);
/* Deallocate event */
clReleaseEvent(prof_event);
}
/* Deallocate resources */
free(scalar_sum);
free(vector_sum);
for (i = 0; i<NUM_KERNELS; i++) {
clReleaseKernel(kernel[i]);
}
clReleaseMemObject(scalar_sum_buffer);
clReleaseMemObject(vector_sum_buffer);
clReleaseMemObject(data_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
#包括
#包括
#包括
#包括
int main(){
cl_int err,i,j;
cl_平台_id*平台;
cl_设备\u id*设备;
cl_uint num_平台、num_设备、矢量宽度;
尺寸、平台名称、设备名称、尺寸;
字符*平台名称数据,*设备名称数据;
err=clGetPlatformIDs(1、NULL和num_平台);
如果(误差<0){
perror(“未找到平台”);
出口(1);
}
平台=(cl_平台id*)malloc(sizeof(cl_平台id)*num_平台);
clGetPlatformIDs(num_平台,platforms,NULL);
printf(“找到的平台数为%d\n”,num_platforms);
对于(i=0;i
简短回答:您正确地查询了它,平台编译器知道最佳向量宽度大小。是的,1的值是正确的
详细答案:对于CPU(任何类型的CPU),它可能更喜欢非矢量化。特别是在“英特尔CPU+编译器”上,由于“英特尔编译器”将矢量化作为优化过程的一部分,因此它不希望用户首先对代码进行矢量化
事实上,看起来nVIDIA也更喜欢用户输入非矢量化代码。这并不意味着,如果已经矢量化,代码将运行得更慢。这只是意味着编译器(由于它有自己的优化技术)更喜欢未矢量化的代码
对OpenCL驱动程序的更新可能会导致这些值的更改。
而且,你应该只把这些值当作参考。其他因素,例如本地内存的使用、合并的全局内存访问、本地工作组大小等,通常更为重要。简短回答:您正确地查询了它,平台编译器知道什么是最佳向量宽度大小。是的,1的值是正确的 详细答案:对于CPU(任何类型的CPU),它可能更喜欢非矢量化。特别是在“英特尔CPU+编译器”上,由于“英特尔编译器”将矢量化作为优化过程的一部分,因此它不希望用户首先对代码进行矢量化 事实上,看起来nVIDIA也更喜欢用户输入非矢量化代码。这并不意味着,如果已经矢量化,代码将运行得更慢。这只是意味着编译器(由于它有自己的优化技术)更喜欢未矢量化的代码 对OpenCL驱动程序的更新可能导致c
/*
 * Per-work-group sum of floats.
 *
 * Each work-item copies one element of `data` into `partial_sums`, then the
 * group repeatedly adds the upper half of the active range onto the lower
 * half, halving the range each pass. Work-item 0 writes the group's total
 * to `output` at the group's index.
 */
__kernel void reduction_scalar(__global float* data,
      __local float* partial_sums, __global float* output) {

   int local_index = get_local_id(0);
   int items_in_group = get_local_size(0);
   int stride = items_in_group / 2;

   /* Stage this work-item's element in local memory */
   partial_sums[local_index] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Fold the upper half onto the lower half until one value remains */
   while (stride > 0) {
      if (local_index < stride) {
         partial_sums[local_index] += partial_sums[local_index + stride];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
      stride >>= 1;
   }

   /* One work-item publishes the group's result */
   if (local_index == 0) {
      output[get_group_id(0)] = partial_sums[0];
   }
}
/*
 * Per-work-group sum of floats, processed four at a time.
 *
 * Each work-item copies one float4 of `data` into `partial_sums`, then the
 * group repeatedly adds the upper half of the active range onto the lower
 * half, halving the range each pass. Work-item 0 collapses the remaining
 * float4 to a single float (dot product with a vector of ones) and writes
 * it to `output` at the group's index.
 */
__kernel void reduction_vector(__global float4* data,
      __local float4* partial_sums, __global float* output) {

   int local_index = get_local_id(0);
   int items_in_group = get_local_size(0);
   int stride = items_in_group / 2;

   /* Stage this work-item's float4 in local memory */
   partial_sums[local_index] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Fold the upper half onto the lower half until one float4 remains */
   while (stride > 0) {
      if (local_index < stride) {
         partial_sums[local_index] += partial_sums[local_index + stride];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
      stride >>= 1;
   }

   /* One work-item adds the four lanes and publishes the group's result */
   if (local_index == 0) {
      output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
   }
}