Visual studio 2015 单独的.cl文件不起作用。错误MSB3722_Visual Studio 2015_Opencl

Visual studio 2015 单独的.cl文件不起作用。错误MSB3722

visual-studio-2015 opencl

Visual studio 2015 单独的.cl文件不起作用。错误MSB3722,visual-studio-2015,opencl,Visual Studio 2015,Opencl,在Visual Studio 2015中，我创建了一个“Windows代码构建项目”（针对CPU）。这个项目附带了我根本没有接触过的代码。它基本上做向量加法。但是，向量添加发生在Template.cl文件上。当我尝试编译此项目时，它会给我以下错误：错误MSB3722命令“C:\Program Files（x86）\Intel\OpenCL SDK\bin\x86\ioc32.exe”-cmd=build-input=“blahblah\user\visual studio 2015\Proje

在 Visual Studio 2015 中，我创建了一个面向 Windows 的 Code Builder 项目（目标设备为 CPU）。这个项目自带的代码我完全没有改动过，它基本上实现的是向量加法，不过向量加法是在 Template.cl 文件的内核中完成的。当我尝试编译此项目时，它会给出以下错误：

错误 MSB3722：命令 "C:\Program Files (x86)\Intel\OpenCL SDK\bin\x86\ioc32.exe" -cmd=build -input="blahblah\user\visual studio 2015\Projects\OpenCLProject3\OpenCLProject3\Template.cl" -output="Debug\Template.out" -VS -device=CPU_2_0 -simd=default -targetos=current -bo="" 已退出，代码为 5。请验证您是否有足够的权限运行此命令。OpenCLProject3 C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations\IntelOpenCL.targets 98

但是如果我将内核复制到我的cpp文件中，并将其作为字符串，那么它就会执行。字符串如下所示：

const char* prog1 = "__kernel void Add(__global int* pA, __global int* pB, __global int* pC){const int x = get_global_id(0);const int y = get_global_id(1);const int width = get_global_size(0);const int id = y * width + x;pC[id] = pA[id] + pB[id];}"

另外，我没有从源文件读取内核源码，而是在函数 CreateAndBuildProgram 中直接传入字符串 prog1 的地址（&prog1）来创建程序。

以下是Visual Studio项目树的结构：

--References
--External 
--Headers
--OpenCL
  --Template.cl
--Source Files
  --OpenCLProject3.cpp
  --utils.cpp

请注意，我已删除错误代码。如果您在visual studio 2015中生成代码生成项目，您将获得完全相同的代码和结构。

以下是主机代码（OpenCLProject3.cpp）

可能与此处的相同：

可能的解决方案是从项目中删除“.cl”文件。

您是否可以发布格式化代码以提高可读性？还不清楚错误的来源。当然。我会加上它给我的一切。嗯，至少cpp和cl文件是的！这是可行的，但这是一个奇怪的解决方案…可能是visual studio的错误？不是真正的错误。对于项目中的所有文件，VS需要知道在构建项目时如何处理它们。对于.cl文件，VS不知道如何处理它，因此出现错误。由于VS不需要做任何事情，所以解决方案是从项目中删除该文件。也可以通过一些项目设置指示VS在构建期间忽略此文件，但我没有尝试。

    #include <stdio.h>
    #include <stdlib.h>
    #include <tchar.h>
    #include <memory.h>
    #include <vector>

    #include "CL\cl.h"
    #include "utils.h"

    //for perf. counters
    #include <Windows.h>


    // Macros for OpenCL versions
    #define OPENCL_VERSION_1_2  1.2f
    #define OPENCL_VERSION_2_0  2.0f

    // Bundles the OpenCL handles used by this sample together with the OpenCL
    // versions detected for the platform, the device and the OpenCL C compiler.
    // The constructor sets every handle to NULL and every version to 1.2; the
    // destructor releases each handle that is non-NULL at destruction time.
    struct ocl_args_d_t
    {
        ocl_args_d_t();
        ~ocl_args_d_t();

        // Regular OpenCL objects:
        cl_context       context;           // context handle
        cl_device_id     device;            // handle of the selected device
        cl_command_queue commandQueue;      // command-queue handle
        cl_program       program;           // program handle
        cl_kernel        kernel;            // kernel handle
        float            platformVersion;   // OpenCL platform version (defaults to 1.2)
        float            deviceVersion;     // OpenCL device version (defaults to 1.2)
        float            compilerVersion;   // device OpenCL C version (defaults to 1.2)

        // Objects that are specific to the algorithm implemented in this sample
        cl_mem           srcA;              // first source buffer
        cl_mem           srcB;              // second source buffer
        cl_mem           dstMem;            // destination buffer
    };

    // Default state: no OpenCL object has been created yet (all handles NULL)
    // and all three version fields start at OpenCL 1.2.
    ocl_args_d_t::ocl_args_d_t()
        : context(NULL)
        , device(NULL)
        , commandQueue(NULL)
        , program(NULL)
        , kernel(NULL)
        , platformVersion(OPENCL_VERSION_1_2)
        , deviceVersion(OPENCL_VERSION_1_2)
        , compilerVersion(OPENCL_VERSION_1_2)
        , srcA(NULL)
        , srcB(NULL)
        , dstMem(NULL)
    {
    }

    // Releases every OpenCL object that is still held. A failing release call is
    // logged and otherwise ignored so the remaining objects still get released.
    // Release order: kernel, program, srcA, srcB, dstMem, command queue, device, context.
    ocl_args_d_t::~ocl_args_d_t()
    {
        if (NULL != kernel)
        {
            const cl_int status = clReleaseKernel(kernel);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != program)
        {
            const cl_int status = clReleaseProgram(program);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != srcA)
        {
            const cl_int status = clReleaseMemObject(srcA);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != srcB)
        {
            const cl_int status = clReleaseMemObject(srcB);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != dstMem)
        {
            const cl_int status = clReleaseMemObject(dstMem);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != commandQueue)
        {
            const cl_int status = clReleaseCommandQueue(commandQueue);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != device)
        {
            const cl_int status = clReleaseDevice(device);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(status));
            }
        }

        if (NULL != context)
        {
            const cl_int status = clReleaseContext(context);
            if (status != CL_SUCCESS)
            {
                LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(status));
            }
        }
    }


    /*
     * Checks whether the name of an OpenCL platform contains a given sub-string.
     *
     * platform          - platform whose CL_PLATFORM_NAME is queried
     * preferredPlatform - sub-string to search for (case-sensitive, via strstr)
     *
     * Returns true when the platform name contains preferredPlatform.
     * Returns false when it does not, when preferredPlatform is NULL, or when
     * clGetPlatformInfo fails (the failure is logged).
     */
    bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform)
    {
        // strstr() must not be handed a NULL needle.
        if (NULL == preferredPlatform)
        {
            return false;
        }

        // First query only the length of the platform name (param_value is NULL).
        size_t stringLength = 0;
        cl_int err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err));
            return false;
        }

        // One extra zero byte keeps the buffer non-empty and NUL-terminated even
        // if the runtime reports a length of 0 or omits the terminator.
        std::vector<char> platformName(stringLength + 1, '\0');

        // Read the platform name itself.
        err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err));
            return false;
        }

        // The platform matches when its name contains the requested sub-string.
        return strstr(&platformName[0], preferredPlatform) != NULL;
    }

    /*
     * Returns the first available OpenCL platform that
     *   - has a name containing preferredPlatform (skipped when preferredPlatform
     *     is NULL or empty), and
     *   - exposes at least one device of type deviceType.
     * Returns NULL when no platform qualifies or when the platforms cannot be enumerated.
     */
    cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType)
    {
        // Step 1: ask only for the number of platforms (no IDs are returned yet).
        cl_uint platformCount = 0;
        cl_int status = clGetPlatformIDs(0, NULL, &platformCount);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(status));
            return NULL;
        }
        LogInfo("Number of available platforms: %u\n", platformCount);

        if (platformCount == 0)
        {
            LogError("Error: No platforms found!\n");
            return NULL;
        }

        // Step 2: fetch the platform IDs themselves.
        std::vector<cl_platform_id> candidates(platformCount);
        status = clGetPlatformIDs(platformCount, &candidates[0], NULL);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(status));
            return NULL;
        }

        // A NULL or empty preference means "any platform name is acceptable".
        const bool filterByName = (preferredPlatform != NULL) && (strlen(preferredPlatform) > 0);

        // Step 3: pick the first candidate that passes the name filter and has a suitable device.
        for (cl_uint idx = 0; idx < platformCount; ++idx)
        {
            if (filterByName && !CheckPreferredPlatformMatch(candidates[idx], preferredPlatform))
            {
                continue;
            }

            // deviceCount stays 0 when the query fails, e.g. because the platform
            // has no device of the requested type; the failure is only logged.
            cl_uint deviceCount = 0;
            status = clGetDeviceIDs(candidates[idx], deviceType, 0, NULL, &deviceCount);
            if (status != CL_SUCCESS)
            {
                LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(status));
            }

            if (deviceCount != 0)
            {
                // At least one device satisfies the requirements.
                return candidates[idx];
            }
        }

        return NULL;
    }


    /*
     * Reads the OpenCL platform version, the device version and the device
     * OpenCL C version strings (clGetPlatformInfo / clGetDeviceInfo) and records
     * the result in the ocl structure. This lets the rest of the program support
     * both OpenCL 1.2 and 2.0 platforms and devices.
     *
     * platformId - platform whose CL_PLATFORM_VERSION is queried
     * ocl        - ocl->device is queried; ocl->platformVersion, ocl->deviceVersion
     *              and ocl->compilerVersion are set to OPENCL_VERSION_2_0 when the
     *              matching string contains "OpenCL 2.0" / "OpenCL C 2.0",
     *              otherwise they are left untouched.
     *
     * Returns CL_SUCCESS, or the error code of the first failing query (which is
     * also logged).
     */
    int GetPlatformAndDeviceVersion (cl_platform_id platformId, ocl_args_d_t *ocl)
    {
        cl_int err = CL_SUCCESS;
        size_t stringLength = 0;

        // ---- Platform version ----
        // Query the string length first (param_value is NULL).
        err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
            return err;
        }

        // The extra zero byte keeps the buffer non-empty and NUL-terminated even
        // if a length of 0 is reported.
        std::vector<char> platformVersion(stringLength + 1, '\0');

        err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err));
            return err;
        }

        if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL)
        {
            ocl->platformVersion = OPENCL_VERSION_2_0;
        }

        // ---- Device version ----
        err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
            return err;
        }

        std::vector<char> deviceVersion(stringLength + 1, '\0');

        err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err));
            return err;
        }

        if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL)
        {
            ocl->deviceVersion = OPENCL_VERSION_2_0;
        }

        // ---- Device OpenCL C (compiler) version ----
        err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
            return err;
        }

        std::vector<char> compilerVersion(stringLength + 1, '\0');

        err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err));
            return err;
        }

        // Plain `if`: the error branch above already returned, so no `else` is needed.
        if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL)
        {
            ocl->compilerVersion = OPENCL_VERSION_2_0;
        }

        return err;
    }


    /*
     * Fills inputArray with arrayWidth * arrayHeight pseudo-random values.
     * The generator is re-seeded with the same constant on every call, so each
     * call writes the same sequence.
     */
    void generateInput(cl_int* inputArray, cl_uint arrayWidth, cl_uint arrayHeight)
    {
        srand(12345);

        const cl_uint elementCount = arrayWidth * arrayHeight;
        cl_int* cursor = inputArray;
        cl_int* const end = inputArray + elementCount;

        // Walk the array front to back, one rand() value per element.
        while (cursor != end)
        {
            *cursor = rand();
            ++cursor;
        }
    }

    /*
     * Creates the basic OpenCL objects for the sample and stores them in ocl:
     * the context, the selected device, the detected versions and the command queue.
     *
     * ocl        - receives context, device, version fields and commandQueue
     * deviceType - device type to look for (e.g. CL_DEVICE_TYPE_CPU)
     *
     * Returns CL_SUCCESS on success, otherwise an OpenCL error code (the failure
     * is logged). Objects created before a failure stay in ocl and are released
     * by its destructor.
     */
    int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
    {
        // The following variable stores return codes for all OpenCL calls.
        cl_int err = CL_SUCCESS;

        // Enumerate the platforms and pick the first one whose name contains
        // "Intel" and that offers a device of the requested type.
        cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
        if (NULL == platformId)
        {
            LogError("Error: Failed to find OpenCL platform.\n");
            return CL_INVALID_VALUE;
        }

        // Create a context for devices of the requested type on that platform.
        // No error callback (pfn_notify) and no user_data are registered.
        cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0};
        ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err);
        if ((CL_SUCCESS != err) || (NULL == ocl->context))
        {
            LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err));
            return err;
        }

        // Query the devices the context was created with. The context may hold
        // more than one device, so ask for the list size first; a buffer that is
        // too small makes clGetContextInfo fail with CL_INVALID_VALUE.
        size_t devicesSize = 0;
        err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, 0, NULL, &devicesSize);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
            return err;
        }
        if (devicesSize < sizeof(cl_device_id))
        {
            LogError("Error: clGetContextInfo() reported no devices in the context.\n");
            return CL_DEVICE_NOT_FOUND;
        }

        std::vector<cl_device_id> devices(devicesSize / sizeof(cl_device_id));
        err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, devices.size() * sizeof(cl_device_id), &devices[0], NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
            return err;
        }

        // The sample works with a single device: use the first one.
        ocl->device = devices[0];

        // Read the platform version and the device OpenCL / OpenCL C versions.
        // The details of a failure are logged by the callee; stop here because
        // the queue creation below depends on the detected device version.
        err = GetPlatformAndDeviceVersion(platformId, ocl);
        if (CL_SUCCESS != err)
        {
            LogError("Error: GetPlatformAndDeviceVersion() returned %s.\n", TranslateOpenCLError(err));
            return err;
        }

        // Create the command queue through which kernels are enqueued on the device.
        // No out-of-order flag is requested; profiling is enabled.
    #ifdef CL_VERSION_2_0
        if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
        {
            const cl_command_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
            ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err);
        }
        else {
            // default behavior: OpenCL 1.2
            cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
            ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
        }
    #else
        // default behavior: OpenCL 1.2
        cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
        ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
    #endif
        if (CL_SUCCESS != err)
        {
            LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err));
            return err;
        }
        if (NULL == ocl->commandQueue)
        {
            LogError("Error: command queue creation returned a NULL queue.\n");
            return CL_INVALID_COMMAND_QUEUE;
        }

        return CL_SUCCESS;
    }


    /*
     * Creates the OpenCL program from the source file "Template.cl" and builds it
     * for ocl->device.
     *
     * ocl - ocl->context and ocl->device are used; ocl->program receives the
     *       created program object.
     *
     * Returns CL_SUCCESS on success, otherwise the error code of the failing
     * step (the failure is logged). When the build fails with
     * CL_BUILD_PROGRAM_FAILURE the build log is logged as well.
     */
    int CreateAndBuildProgram(ocl_args_d_t *ocl)
    {
        cl_int err = CL_SUCCESS;

        // Load the OpenCL C source code from the input file.
        // ReadSourceFromFile returns the text in source and its size in src_size.
        char* source = NULL;
        size_t src_size = 0;
        err = ReadSourceFromFile("Template.cl", &source, &src_size);
        if (CL_SUCCESS != err)
        {
            LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
            goto Finish;
        }

        // Create the program object from the source text.
        ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, &err);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
            goto Finish;
        }

        // A program is not built when it is created; the build is an explicit step.
        // This sample uses the plain create-build sequence; clCompileProgram and
        // clLinkProgram are the alternatives for programs made of several parts.
        err = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));

            // On a compile failure fetch the build log: first its size, then its text.
            if (err == CL_BUILD_PROGRAM_FAILURE)
            {
                size_t log_size = 0;
                cl_int logErr = clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

                if ((CL_SUCCESS == logErr) && (log_size > 0))
                {
                    // One extra zero byte guarantees the "%s" below sees a terminated string.
                    std::vector<char> build_log(log_size + 1, '\0');
                    logErr = clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
                    if (CL_SUCCESS == logErr)
                    {
                        LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
                    }
                }

                if (CL_SUCCESS != logErr)
                {
                    LogError("Error: clGetProgramBuildInfo() returned %s.\n", TranslateOpenCLError(logErr));
                }
            }
        }

    Finish:
        // NOTE(review): delete[] assumes ReadSourceFromFile allocates the buffer
        // with new[] -- confirm against utils.cpp.
        if (source)
        {
            delete[] source;
            source = NULL;
        }

        return err;
    }


    /*
     * Creates the three OpenCL buffers used by the kernel, each one directly on
     * top of the matching host array (CL_MEM_USE_HOST_PTR):
     *   ocl->srcA   - read-only,  backed by inputA
     *   ocl->srcB   - read-only,  backed by inputB
     *   ocl->dstMem - write-only, backed by outputC
     * Every buffer is sizeof(cl_uint) * arrayWidth * arrayHeight bytes.
     *
     * Returns CL_SUCCESS, or the error code of the first clCreateBuffer call that
     * fails (the failure is logged; buffers created earlier stay in ocl).
     */
    int CreateBufferArguments(ocl_args_d_t *ocl, cl_int* inputA, cl_int* inputB, cl_int* outputC, cl_uint arrayWidth, cl_uint arrayHeight)
    {
        cl_int status = CL_SUCCESS;
        const size_t bufferBytes = sizeof(cl_uint) * arrayWidth * arrayHeight;

        // The inputs are only read by the kernel, so they get the minimal
        // CL_MEM_READ_ONLY access flag.
        ocl->srcA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferBytes, inputA, &status);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clCreateBuffer for srcA returned %s\n", TranslateOpenCLError(status));
            return status;
        }

        ocl->srcB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferBytes, inputB, &status);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clCreateBuffer for srcB returned %s\n", TranslateOpenCLError(status));
            return status;
        }

        // The output is only written by the kernel, hence CL_MEM_WRITE_ONLY.
        // Depending on the runtime and hardware, creating it on top of the host
        // array may avoid an extra data copy.
        ocl->dstMem = clCreateBuffer(ocl->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, bufferBytes, outputC, &status);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clCreateBuffer for dstMem returned %s\n", TranslateOpenCLError(status));
            return status;
        }

        return CL_SUCCESS;
    }


    /*
     * Binds the three buffers to the kernel arguments:
     *   argument 0 <- ocl->srcA, argument 1 <- ocl->srcB, argument 2 <- ocl->dstMem.
     *
     * Returns CL_SUCCESS, or the error code of the first clSetKernelArg call that
     * fails (logged). The cl_int code is returned through the cl_uint return type.
     */
    cl_uint SetKernelArguments(ocl_args_d_t *ocl)
    {
        cl_int status = clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA);
        if (status != CL_SUCCESS)
        {
            LogError("error: Failed to set argument srcA, returned %s\n", TranslateOpenCLError(status));
            return status;
        }

        status = clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB);
        if (status != CL_SUCCESS)
        {
            LogError("Error: Failed to set argument srcB, returned %s\n", TranslateOpenCLError(status));
            return status;
        }

        status = clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem);
        if (status != CL_SUCCESS)
        {
            LogError("Error: Failed to set argument dstMem, returned %s\n", TranslateOpenCLError(status));
        }

        // Either CL_SUCCESS or the code of the failed last call.
        return status;
    }


    /*
     * Enqueues ocl->kernel over a two-dimensional width x height global range and
     * blocks until the command queue has finished.
     *
     * Returns CL_SUCCESS, or the error code of the failing call (logged).
     */
    cl_uint ExecuteAddKernel(ocl_args_d_t *ocl, cl_uint width, cl_uint height)
    {
        // Global iteration space: one work-item per array element.
        // The work-group size is left to the runtime (local size is NULL).
        size_t ndRange[2] = {width, height};

        cl_int status = clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 2, NULL, ndRange, NULL, 0, NULL, NULL);
        if (status != CL_SUCCESS)
        {
            LogError("Error: Failed to run kernel, return %s\n", TranslateOpenCLError(status));
            return status;
        }

        // Enqueueing does not wait for completion; clFinish does.
        status = clFinish(ocl->commandQueue);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clFinish return %s\n", TranslateOpenCLError(status));
            return status;
        }

        return CL_SUCCESS;
    }


    /*
     * "Reads" the result buffer by mapping ocl->dstMem into host memory and checks
     * every element against inputA[k] + inputB[k].
     *
     * ocl            - provides commandQueue and dstMem
     * width, height  - array dimensions; width * height elements are checked
     * inputA, inputB - the host input arrays the kernel result is compared with
     *
     * Returns true when every element matches. Returns false when the buffer
     * cannot be mapped, when clFinish fails (the data is then not verified), or
     * when at least one element differs (each mismatch is logged).
     */
    bool ReadAndVerify(ocl_args_d_t *ocl, cl_uint width, cl_uint height, cl_int *inputA, cl_int *inputB)
    {
        cl_int err = CL_SUCCESS;
        bool result = true;

        // Map the output buffer into the host address space. The map is blocking.
        cl_int *resultPtr = (cl_int *)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * width * height, 0, NULL, NULL, &err);

        if ((CL_SUCCESS != err) || (NULL == resultPtr))
        {
            LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
            return false;
        }

        // Call clFinish to guarantee that the output region is updated.
        err = clFinish(ocl->commandQueue);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
            // The mapped data cannot be trusted; report failure instead of verifying it.
            result = false;
        }
        else
        {
            // resultPtr now exposes the kernel output: compare it element by element.
            const unsigned int size = width * height;
            for (unsigned int k = 0; k < size; ++k)
            {
                if (resultPtr[k] != inputA[k] + inputB[k])
                {
                    // %u: the index is unsigned.
                    LogError("Verification failed at %u: (%d + %d = %d)\n", k, inputA[k], inputB[k], resultPtr[k]);
                    result = false;
                }
            }
        }

        // Unmap the output buffer before it gets released.
        err = clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, resultPtr, 0, NULL, NULL);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(err));
        }

        return result;
    }


    /*
     * main execution routine
     * Basically it consists of three parts:
     *   - generating the inputs
     *   - running OpenCL kernel
     *   - reading results of processing
     */
    int _tmain(int argc, TCHAR* argv[])
    {
        cl_int err;
        ocl_args_d_t ocl;
        cl_device_type deviceType = CL_DEVICE_TYPE_CPU;

        LARGE_INTEGER perfFrequency;
        LARGE_INTEGER performanceCountNDRangeStart;
        LARGE_INTEGER performanceCountNDRangeStop;

        cl_uint arrayWidth  = 1024;
        cl_uint arrayHeight = 1024;

        //initialize Open CL objects (context, queue, etc.)
        if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType))
        {
            return -1;
        }

        // allocate working buffers. 
        // the buffer should be aligned with 4K page and size should fit 64-byte cached line
        cl_uint optimizedSize = ((sizeof(cl_int) * arrayWidth * arrayHeight - 1)/64 + 1) * 64;
        cl_int* inputA  = (cl_int*)_aligned_malloc(optimizedSize, 4096);
        cl_int* inputB  = (cl_int*)_aligned_malloc(optimizedSize, 4096);
        cl_int* outputC = (cl_int*)_aligned_malloc(optimizedSize, 4096);
        if (NULL == inputA || NULL == inputB || NULL == outputC)
        {
            LogError("Error: _aligned_malloc failed to allocate buffers.\n");
            return -1;
        }

        //random input
        generateInput(inputA, arrayWidth, arrayHeight);
        generateInput(inputB, arrayWidth, arrayHeight);

        // Create OpenCL buffers from host memory
        // These buffers will be used later by the OpenCL kernel
        if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight))
        {
            return -1;
        }

         // Create and build the OpenCL program
        if (CL_SUCCESS != CreateAndBuildProgram(&ocl))
        {
            return -1;
        }

        // Program consists of kernels.
        // Each kernel can be called (enqueued) from the host part of OpenCL application.
        // To call the kernel, you need to create it from existing program.
        ocl.kernel = clCreateKernel(ocl.program, "Add", &err);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err));
            return -1;
        }

        // Passing arguments into OpenCL kernel.
        if (CL_SUCCESS != SetKernelArguments(&ocl))
        {
            return -1;
        }

        // Regularly you wish to use OpenCL in your application to achieve greater performance results
        // that are hard to achieve in other ways.
        // To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution.
        // The recommended way to obtain this time is to measure interval between two moments:
        //   - just before clEnqueueNDRangeKernel is called, and
        //   - just after clFinish is called
        // clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough,
        // because this call doesn't guarantees that kernel is finished.
        // clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends.
        // clFinish waits until all commands in command queue are finished, that suits your need to measure time.
        bool queueProfilingEnable = true;
        if (queueProfilingEnable)
            QueryPerformanceCounter(&performanceCountNDRangeStart);
        // Execute (enqueue) the kernel
        if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight))
        {
            return -1;
        }
        if (queueProfilingEnable)
            QueryPerformanceCounter(&performanceCountNDRangeStop);

        // The last part of this function: getting processed results back.
        // use map-unmap sequence to update original memory area with output buffer.
        ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB);

        // retrieve performance counter frequency
        if (queueProfilingEnable)
        {
            QueryPerformanceFrequency(&perfFrequency);
            LogInfo("NDRange performance counter time %f ms.\n",
                1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart);
        }

        _aligned_free(inputA);
        _aligned_free(inputB);
        _aligned_free(outputC);

        return 0;
    }

// Element-wise addition over a 2D global range: each work-item computes one
// element, pC[i] = pA[i] + pB[i], where i is the work-item's row-major index
// (global id 1 times the global size of dimension 0, plus global id 0).
__kernel void Add(__global int* pA, __global int* pB, __global int* pC)
{
    const int col      = get_global_id(0);
    const int row      = get_global_id(1);
    const int rowPitch = get_global_size(0);

    const int offset = row * rowPitch + col;

    pC[offset] = pA[offset] + pB[offset];
}