OpenCL固定内存与堆内存_Opencl

OpenCL固定内存与堆内存

opencl

OpenCL固定内存与堆内存,opencl,Opencl,我编写了一个示例程序，以了解GPU/CPU固定内存和堆内存的影响。下面的代码说明了这一点。我已经分配了三个尺寸为1280x720的缓冲区。我用一些数据填充了缓冲区1和2，然后用这些缓冲区填充缓冲区3。填充缓冲器3所涉及的数学运算无关紧要。在案例1中，从这些缓冲区分配的内存来自堆（malloc调用）。在案例2中，这些缓冲区的内存是通过OpenCLAPI调用（clCreateBuffer（））分配的。这两种情况之间存在性能差异。我在英特尔集成GPU上测试了它。我无法解释这种性能上的差异。它是否与CP

我编写了一个示例程序，用来了解 GPU/CPU 固定内存（pinned memory）与堆内存对性能的影响，代码如下所示。程序分配了三个大小为 1280x720 的缓冲区：先用一些数据填充缓冲区 1 和缓冲区 2，再用这两个缓冲区的内容计算并填充缓冲区 3。填充缓冲区 3 时所做的具体数学运算并不重要。在情况 1 中，这些缓冲区的内存是从堆上分配的（调用 malloc）；在情况 2 中，这些缓冲区的内存是通过 OpenCL API（clCreateBuffer()）分配，再用 clEnqueueMapBuffer() 映射到主机端的。这两种情况之间存在性能差异。我是在英特尔集成 GPU 上测试的，但无法解释这种性能差异。它是否与 CPU/GPU 固定内存和堆内存的可缓存属性有关？

你以前遇到过这样的现象吗？还是我哪里做错了？

#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>

#define OPENCL

#if defined(_WIN32)
/*
 * Win32 specific includes
 */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#else
#include <sys/time.h>

/*
 * Fallback definition of timersub, used only when the included headers have
 * not already defined it (the original note here: msys did not provide it at
 * the time of writing).
 *
 * Stores a - b into *result. All three arguments are pointers to structures
 * with tv_sec and tv_usec members. When the microsecond difference comes out
 * negative, one second is borrowed so that result->tv_usec is brought back
 * up by 1000000.
 */
#ifndef timersub
#define timersub(a, b, result) \
    do { \
      (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
      (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
      if ((result)->tv_usec < 0) { \
        --(result)->tv_sec; \
        (result)->tv_usec += 1000000; \
      } \
    } while (0)
#endif
#endif


/*
 * Pair of timestamps delimiting one measured interval.
 * The timestamp type is the native one for the platform's clock call:
 * LARGE_INTEGER on Windows, struct timeval everywhere else.
 */
struct usec_timer {
#if defined(_WIN32)
  LARGE_INTEGER begin;  /* written by usec_timer_start() */
  LARGE_INTEGER end;    /* written by usec_timer_mark()  */
#else
  struct timeval begin; /* written by usec_timer_start() */
  struct timeval end;   /* written by usec_timer_mark()  */
#endif
};


/*
 * Stamp the start of the interval: stores the current reading of the
 * platform clock in t->begin.
 */
static void usec_timer_start(struct usec_timer *t) {
#if defined(_WIN32)
  LARGE_INTEGER *const stamp = &t->begin;

  QueryPerformanceCounter(stamp);
#else
  struct timeval *const stamp = &t->begin;

  gettimeofday(stamp, NULL);
#endif
}


/*
 * Stamp the end of the interval: stores the current reading of the
 * platform clock in t->end. May be called repeatedly; each call
 * overwrites the previous end stamp.
 */
static void usec_timer_mark(struct usec_timer *t) {
#if defined(_WIN32)
  LARGE_INTEGER *const stamp = &t->end;

  QueryPerformanceCounter(stamp);
#else
  struct timeval *const stamp = &t->end;

  gettimeofday(stamp, NULL);
#endif
}


/*
 * Return the time between t->begin and t->end in microseconds.
 *
 * All arithmetic is carried out in 64 bits so the result does not overflow
 * when the platform's native second/tick type is narrower or when the
 * interval is long.
 */
static int64_t usec_timer_elapsed(struct usec_timer *t) {
#if defined(_WIN32)
  LARGE_INTEGER freq;
  const int64_t ticks = (int64_t)(t->end.QuadPart - t->begin.QuadPart);

  QueryPerformanceFrequency(&freq);

  /*
   * Convert whole seconds and the sub-second remainder separately:
   * multiplying the raw tick count by 1000000 first can overflow 64 bits
   * on high-frequency counters.
   */
  return (ticks / freq.QuadPart) * 1000000
         + (ticks % freq.QuadPart) * 1000000 / freq.QuadPart;
#else
  struct timeval diff;

  timersub(&t->end, &t->begin, &diff);

  /* Widen before multiplying: tv_sec may be a 32-bit type. */
  return (int64_t)diff.tv_sec * 1000000 + (int64_t)diff.tv_usec;
#endif
}


#ifdef OPENCL
#include ".\CL\cl.h"

/*
 * Create an OpenCL context and a command queue on one GPU device of the
 * first platform reported by clGetPlatformIDs.
 *
 * context   - receives the created context; set to NULL on failure.
 * cmd_queue - receives the created command queue; set to NULL on failure.
 *
 * Returns 0 on success and 1 on any failure. On failure nothing created by
 * this function is left alive, so the caller has nothing to release.
 */
int opencl_init(cl_context *context, cl_command_queue *cmd_queue) {
  cl_int status;
  cl_uint num_platforms = 0;
  cl_platform_id platform;
  cl_uint num_devices = 0;
  cl_device_id device;
  cl_command_queue_properties command_queue_properties = 0;

  if (context == NULL || cmd_queue == NULL) {
    return 1;
  }

  // Give the outputs a defined value on every failure path.
  *context = NULL;
  *cmd_queue = NULL;

  // Get the number of platforms in the system.
  status = clGetPlatformIDs(0, NULL, &num_platforms);
  if (status != CL_SUCCESS || num_platforms == 0) {
    goto fail;
  }

  // Get the platform ID for one platform
  status = clGetPlatformIDs(1, &platform, NULL);
  if (status != CL_SUCCESS) {
    goto fail;
  }

  // Get the number of devices available on the platform
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
  if (status != CL_SUCCESS || num_devices == 0) {
    goto fail;
  }

  // Get the device ID for one device
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  if (status != CL_SUCCESS) {
    goto fail;
  }

  // Create OpenCL context for one device
  *context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  if (status != CL_SUCCESS || *context == NULL) {
    goto fail;
  }

  // Create command queues for the device
  *cmd_queue = clCreateCommandQueue(*context, device, command_queue_properties,
                                    &status);
  if (status != CL_SUCCESS || *cmd_queue == NULL) {
    goto fail;
  }
  return 0;

fail:
  // The context may already exist if only the queue creation failed.
  if (*context != NULL) {
    clReleaseContext(*context);
    *context = NULL;
  }
  *cmd_queue = NULL;
  return 1;
}
#endif

/*
 * Benchmark driver.
 *
 * Usage: <program> <use_gpu>
 *   use_gpu == 0 : the three 1280x720 byte buffers come from malloc().
 *   use_gpu != 0 : the buffers are OpenCL memory objects created with
 *                  CL_MEM_ALLOC_HOST_PTR and mapped into host address space.
 *
 * In both modes the same CPU loop (600 passes of buffer[2] += buffer[0] +
 * buffer[1]) is timed and the elapsed microseconds are printed.
 *
 * Returns EXIT_SUCCESS when the benchmark ran and every resource was
 * released cleanly, EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv) {
  const int width = 1280, height = 720;
  const size_t buffer_size = (size_t)width * height * sizeof(unsigned char);
  unsigned char *buffer[3] = { NULL, NULL, NULL };
  cl_mem opencl_mem[3] = { NULL, NULL, NULL };
  cl_context context = NULL;
  cl_command_queue cmd_queue = NULL;
  cl_int status;
  int use_gpu;
  int x, y, z;
  int exit_code = EXIT_FAILURE;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <use_gpu: 0 or 1>\n",
            (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
    return EXIT_FAILURE;
  }

  use_gpu = atoi(argv[1]);

  if (use_gpu) {
    // Without a context and queue none of the calls below can work.
    if (opencl_init(&context, &cmd_queue)) {
      fprintf(stderr, "OpenCL init failure\n");
      return EXIT_FAILURE;
    }

    for (x = 0; x < 3; x++) {
      opencl_mem[x] = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                     buffer_size, NULL, &status);
      if (status != CL_SUCCESS || opencl_mem[x] == NULL) {
        opencl_mem[x] = NULL;
        fprintf(stderr, "clCreateBuffer failed (%d)\n", (int)status);
        goto cleanup;
      }

      // Blocking map: the pointer is usable as soon as the call returns.
      buffer[x] = clEnqueueMapBuffer(cmd_queue, opencl_mem[x], CL_TRUE,
                                     CL_MAP_READ | CL_MAP_WRITE, 0,
                                     buffer_size, 0, NULL, NULL, &status);
      if (status != CL_SUCCESS || buffer[x] == NULL) {
        buffer[x] = NULL;
        fprintf(stderr, "clEnqueueMapBuffer failed (%d)\n", (int)status);
        goto cleanup;
      }
    }
  } else {
    for (x = 0; x < 3; x++) {
      buffer[x] = malloc(buffer_size);
      if (buffer[x] == NULL) {
        fprintf(stderr, "Unable to alloc memory\n");
        goto cleanup;
      }
    }
  }

  memset(buffer[0], 1, buffer_size);
  memset(buffer[1], 2, buffer_size);
  memset(buffer[2], 0, buffer_size);

  {
    struct usec_timer emr_timer;
    usec_timer_start(&emr_timer);
    for (z = 0; z < 600; z++) {
      for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
          // don't worry about overflows
          buffer[2][y * width + x] += buffer[0][y * width + x]
                                     + buffer[1][y * width + x];
        }
      }
    }
    usec_timer_mark(&emr_timer);
    // usec_timer_elapsed() returns int64_t, so print it as signed.
    printf("Elapsed time %" PRId64 "\n", usec_timer_elapsed(&emr_timer));
  }

  exit_code = EXIT_SUCCESS;

cleanup:
  if (use_gpu) {
    // Release every buffer that was created, not just the first one.
    for (x = 0; x < 3; x++) {
      if (buffer[x] != NULL) {
        status = clEnqueueUnmapMemObject(cmd_queue, opencl_mem[x], buffer[x],
                                         0, NULL, NULL);
        if (status == CL_SUCCESS) {
          status = clFinish(cmd_queue);
        }
        if (status != CL_SUCCESS) {
          exit_code = EXIT_FAILURE;
        }
        buffer[x] = NULL;
      }

      if (opencl_mem[x] != NULL) {
        if (clReleaseMemObject(opencl_mem[x]) != CL_SUCCESS) {
          exit_code = EXIT_FAILURE;
        }
        opencl_mem[x] = NULL;
      }
    }

    if (cmd_queue != NULL) {
      clReleaseCommandQueue(cmd_queue);
    }
    if (context != NULL) {
      clReleaseContext(context);
    }
  } else {
    for (x = 0; x < 3; x++) {
      free(buffer[x]);
      buffer[x] = NULL;
    }
  }
  return exit_code;
}

#包括
#包括
#包括
#包括
#包括
#定义OPENCL
#如果已定义（_WIN32）
/*
*Win32特定的包括
*/
#如果NDEF WIN32_LEAN_和_MEAN
#定义WIN32_精益_和_平均值
#恩迪夫
#包括
#否则
#包括
/*msys目前未提供timersub*/
#ifndef timersub
#定义timersub（a、b、结果）\
做{\
（结果）->tv_-sec=（a）->tv_-sec-（b）->tv_-sec\
（结果）->tv_usec=（a）->tv_usec-（b）->tv_usec\
如果（（结果）->tv_usec<0）{\
--（结果）->tv_秒\
（结果）->tv_usec+=1000000\
} \
}而（0）
#恩迪夫
#恩迪夫
结构usec_计时器{
#如果已定义（_WIN32）
大整数开始、结束；
#否则
结构timeval开始、结束；
#恩迪夫
};
静态无效usec\U计时器\U启动（结构usec\U计时器*t）{
#如果已定义（_WIN32）
查询性能计数器（&t->开始）；
#否则
gettimeofday（&t->开始，空）；
#恩迪夫
}
静态无效usec\U计时器\U标记（结构usec\U计时器*t）{
#如果已定义（_WIN32）
QueryPerformanceCounter（&t->end）；
#否则
gettimeofday（&t->结束，空）；
#恩迪夫
}
静态int64_t usec_timer_已过（struct usec_timer*t）{
#如果已定义（_WIN32）
大整数频率差；
diff.QuadPart=t->end.QuadPart-t->begin.QuadPart；
QueryPerformanceFrequency（&freq）；
返回差分四部分*1000000/频率四部分；
#否则
结构时间差；
timersub（&t->end，&t->begin，&diff）；
返回diff.tv_sec*1000000+diff.tv_usec；
#恩迪夫
}
#ifdef OPENCL
#包括“\CL\CL.h”
int opencl_init（cl_上下文*上下文，cl_命令队列*命令队列）{
国际地位；
cl_uint num_平台=0；
cl_平台\u id平台；
设备数量=0；
cl_设备\u id设备；
cl_命令_队列_属性命令_队列_属性=0；
//获取系统中平台的数量。
状态=clGetPlatformIDs（0、NULL和num_平台）；
如果（状态！=CL|u成功| num|u平台==0）
走向失败；
//获取一个平台的平台ID
状态=clGetPlatformIDs（1，平台，空）；
如果（状态！=CL_成功）
走向失败；
//获取平台上可用的设备数量
状态=CLGetDeviceID（平台、CL\U设备类型\U GPU、0、NULL和num\U设备）；
如果（状态！=CL|u成功| num|u设备==0）
走向失败；
//获取一个设备的设备ID
状态=CLGetDeviceID（平台，CL\U设备类型\U GPU，1，&设备，空）；
如果（状态！=CL_成功）
走向失败；
//为一个设备创建OpenCL上下文
*context=clCreateContext（NULL、1和设备、NULL、NULL和状态）；
if（status！=CL|u SUCCESS |*context==NULL）
走向失败；
//为设备创建命令队列
*cmd_queue=clCreateCommandQueue（*上下文、设备、命令队列属性和状态）；
if（status！=CL|u SUCCESS |*cmd|u queue==NULL）
走向失败；
返回0；
失败：
返回1；
}
#恩迪夫
int main（int argc，字符**argv）{
int x，y，z；
内部宽度=1280，高度=720；
无符号字符*缓冲区[3]；
int使用gpu；
cl_mem opencl_mem[3]；
语境；
cl_命令_队列cmd_队列；
国际地位；
如果（argc！=2）
返回0；
使用gpu=atoi（argv[1]）；
如果（使用gpu）{
if（opencl_init（&context，&cmd_queue））
printf（“OpenCL初始化失败”）；
}
如果（使用gpu）{
对于（x=0；x<3；x++）{
opencl_mem[x]=clCreateBuffer（上下文，
CL_MEM_只读| CL_MEM_ALLOC_HOST|PTR，
宽度*高度*大小（*缓冲区[x]），空，
&地位）；
如果（状态！=CL_成功）
返回0；
缓冲区[x]=clEnqueueMapBuffer（cmd_队列，opencl_mem[x]，CL_TRUE，
CL_映射读取CL|u映射写入，0，
宽度*高度*大小（*缓冲区[x]），0，
空、空和状态）；
如果（状态！=CL_成功）{
clreleasemobject（opencl_mem[x]）；
opencl_mem[x]=NULL；
返回0；
}
}
}否则{
对于（x=0；x<3；x++）{
缓冲区[x]=malloc（宽度*高度*尺寸（*缓冲区[x]）；
if（缓冲区[x]==NULL）{
printf（“无法分配内存”）；
}
}
}
memset（缓冲区[0]，1，宽度*高度*大小（*缓冲区[0]）；
memset（缓冲区[1]，2，宽度*高度*大小（*缓冲区[1]）；
memset（缓冲区[2]，0，宽度*高度*大小（*缓冲区[2]）；
{
结构usec_定时器emr_定时器；
使用定时器启动（&emr定时器）；
对于（z=0；z<600；z++）{
对于（y=0；y<高度；y++）{
对于（x=0；x