CUDA:请求优化 CalculateConvolutionOutputTensor__im2col
我写这篇帖子是为了寻求关于优化我的解决方案/方法“CalculateConvolutionOutputTensor__im2col”的指导。我希望得到以下帮助:确定超越我目前朴素做法的最佳策略;提供关于相关GPU机制及其适用方式的直观理解(例如共享内存的存储体冲突,bank conflicts);并帮助解读上述性能分析结果中我可以调整的内容。 使用GeForce 2080 Ti首次运行该方法需要0.774秒。我已经附上了我所写的唯一一个CUDA C++内核 im2col 的Nsight Compute性能分析截图。 我可以做的事 我可以让每个GPU线程访问共享内存而不是全局内存。我可以将GPU“堆(heap)”变量转移到内核“栈(stack)”中,而不是在每个线程以及内核的每次循环迭代中都进行解引用。我可以将小参数放入GPU内存中的数组,并只传递指向这些数组的单个指针。我可以使用更复杂的im2col版本。 我尝试过的事情 我不希望使用cuDNN 7.6.5;当我使用cuDNN 7.6.5并编写语句“cudnnCreate(&cudnnHandle);”时,Nsight Compute提示方法cuModuleGetFunction返回CUDA_ERROR_NOT_FOUND。 复现方案 我创建此项目的过程是:使用Visual Studio Community 2019创建一个新的CUDA 10.2运行时项目,将默认源文件重命名为“main.cu”,并用下面的第一个代码块替换其全部内容;将“CalculateConvolutionOutputTensor__im2col.h”添加到项目中,并加入下面的第二个代码块;将“CalculateConvolutionOutputTensor__im2col.cu”添加到项目中,并加入下面的第三个代码块;最后将“cublas.lib;”添加到 项目属性->链接器->输入->附加依赖项。 main.cu CalculateConvolutionOutputTensor__im2col.cu CUDA:请求优化 CalculateConvolutionOutputTensor__im2col,cuda,conv-neural-network,convolution,nsight-compute,Cuda,Conv Neural Network,Convolution,Nsight Compute,我写这篇帖子是为了寻求关于优化我的解决方案/方法“CalculateConvolutionOutputTensor__im2col”的指导。我希望得到以下帮助:确定超越我目前朴素做法的最佳策略;提供关于相关GPU机制及其适用方式的直观理解(例如共享内存的存储体冲突,bank conflicts);并帮助解读上述性能分析结果中我可以调整的内容。 使用GeForce 2080 Ti首次运行该方法需要0.774秒。我已经附上了我所写的唯一一个CUDA C++内核 im2col 的Nsight Compute性能分析截图。 我可以做的事 我可以让每个GPU线程访问共享内存而不是
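// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image_child,
int* elementsInImage_child);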
空隙计算进化输出传感器(
浮动*卷积输出传感器,
int elementsInFilter_child,
输出传感器、主机和子设备通道中的int元素,
int imagesinu child,
int*channelsInFilter,
int*过滤器的宽度,
int*输出传感器的高度,
int*输出传感器的宽度,
int*元件输出传感器的通道,
int*通道过滤器\乘以\元件输出传感器通道,
int*元件在输出传感器通道中滤波器的横截面乘以元件,
浮动*输入传感器\u子级,
int*水平过滤栅,
int*channelsInImage,
int*verticalFilterStride,
int*元素在图像的横截面上,
int*elementsInImage,
int filters_child,
浮子*过滤器传感器,
int ELEMENTSINOUTPUTSUBENSOR(子元件)
{
// -----------------------------------------
//定义和声明im2col的参数。
// -----------------------------------------
//定义im2col执行配置的参数。
对于im2col=885,每个块的int线程数;
整数块\u用于\u im2col=
(elementsInFilter_child+线程每_block_用于_im2col-1)/线程每_block_用于_im2col;
//申报。
浮点数*col;
//需要以下声明才能
//防止将产品自动强制转换为8字节整数。
int ELEMENT SINFILTER\U times\U ELEMENT SINFOUTPUT传感器通道=
元件SINFILTER_child*元件SINFALONOUTPUTENSOR_host_child的通道;
Cudamaloc(&col,滤芯过滤乘以滤芯过滤通道输出传感器*尺寸(浮动));
// -----------------------------------------------------------------------------
//定义用于计算滤波器传感器和col矩阵乘积的参数。
// -----------------------------------------------------------------------------
//定义一个名为cublasHandle的cublasHandle\u t对象。
//声明cublasHandle需要“#包括“cublas_v2.h””。
//定义cublasHandle需要将“cublas.lib”添加到
//属性->链接器->输入->其他依赖项。
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
//定义(不包括)的参数
//乘积滤波器传感器和列中第三矩阵的一部分。
浮点数1=1.0;
浮动零点=0.0;
// ------------------------------------------------------------
//对于细分中的每个图像,
//将图像雕刻成矩阵列。
//计算滤波器传感器和col的矩阵乘积,并
//将产品作为卷积传感器的子传感器存储。
// ------------------------------------------------------------
//需要以下声明才能
//防止将产品自动强制转换为8字节整数。
int image_times_element输入输出子传感器;
int*图像处理器;
Cudamaloc(和图像处理器,sizeof(int));
对于(int image\u host=0;image\u host// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
// Test driver for the fifteen-pointer variant of
// CalculateConvolutionOutputTensor__im2col.
//
// Uploads a synthetic input tensor of 3 * 608 * 608 * 4 floats and a random
// filter tensor of 6 * 3 * 3 * 590 * 590 floats, uploads every scalar the
// im2col kernel reads as a single device int, runs the convolution, and
// prints the first 18 output values.
//
// Returns 0 on success and 1 if any CUDA runtime call reported an error
// (each failure is also printed), so a failed run no longer prints
// uninitialised output as if it were a result.
int main()
{
    // Set to false by check() the first time a CUDA runtime call fails.
    bool allCallsSucceeded = true;
    // Reports a failed CUDA runtime call and remembers that it happened.
    auto check = [&allCallsSucceeded](cudaError_t status, const char* action) {
        if (status != cudaSuccess) {
            printf("CUDA error while %s: %s\n", action, cudaGetErrorString(status));
            allCallsSucceeded = false;
        }
    };
    // Allocates one int on the device, copies `value` into it and returns the
    // device pointer (nullptr when the allocation failed; cudaFree accepts that).
    auto uploadInt = [&check](int value) -> int* {
        int* devicePointer = nullptr;
        check(cudaMalloc(&devicePointer, sizeof(int)), "allocating a device int");
        if (devicePointer != nullptr) {
            check(
                cudaMemcpy(devicePointer, &value, sizeof(int), cudaMemcpyHostToDevice),
                "copying an int to the device");
        }
        return devicePointer;
    };

    // --------------------------------------------------------------------------
    // Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
    // --------------------------------------------------------------------------
    const int imagesInSubdivision = 4;
    const int elementsInFilter = 3 * 590 * 590;
    const int elementsInChannelOfOutputTensor_host = 19 * 19;
    const int filters = 6 * 3;
    const int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
    const int elementsInConvolutionOutputTensor = elementsInOutputSubtensor * imagesInSubdivision;

    float* convolutionOutputTensor = nullptr;
    check(
        cudaMalloc(&convolutionOutputTensor, elementsInConvolutionOutputTensor * sizeof(float)),
        "allocating convolutionOutputTensor");

    // Scalars that the im2col kernel dereferences on the device.
    int* channelsInFilter_GPU = uploadInt(3);
    int* widthOfFilter_GPU = uploadInt(590);
    int* heightOfOutputTensor_GPU = uploadInt(19);
    int* widthOfOutputTensor_GPU = uploadInt(19);
    int* elementsInChannelOfOutputTensor_GPU = uploadInt(elementsInChannelOfOutputTensor_host);
    int* channelsInFilter_times_elementsInChannelOfOutputTensor_GPU = uploadInt(3 * 19 * 19);
    int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU =
        uploadInt(3 * 590 * 19 * 19);
    int* horizontalFilterStride_GPU = uploadInt(1);
    int* channelsInImage_GPU = uploadInt(3);
    int* verticalFilterStride_GPU = uploadInt(1);
    int* elementsInCrossSectionOfImage_GPU = uploadInt(3 * 608);
    int* elementsInImage_GPU = uploadInt(3 * 608 * 608);

    // Input tensor: a repeating ramp in [0, 1).
    const int elementsInInputTensor = 3 * 608 * 608 * 4;
    float* inputTensor_host = new float[elementsInInputTensor];
    for (int i = 0; i < elementsInInputTensor; ++i) {
        inputTensor_host[i] = ((float)(i % 255)) / 255.0f;
    }
    float* inputTensor_GPU = nullptr;
    check(
        cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float)),
        "allocating inputTensor_GPU");
    check(
        cudaMemcpy(
            inputTensor_GPU,
            inputTensor_host,
            elementsInInputTensor * sizeof(float),
            cudaMemcpyHostToDevice),
        "uploading the input tensor");
    delete[] inputTensor_host;

    // Filter tensor: normally distributed values scaled by 1 / sqrt(element count).
    const int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
    float* filterTensor_host = new float[elementsInFilterTensor];
    std::default_random_engine randomNumberGenerator(time(NULL));
    std::normal_distribution<float> normalDistribution(0.0, 1.0);
    // Loop-invariant, so compute it once instead of once per element.
    const auto filterScale = sqrt((float)elementsInFilterTensor);
    for (int i = 0; i < elementsInFilterTensor; ++i) {
        filterTensor_host[i] = normalDistribution(randomNumberGenerator) / filterScale;
    }
    float* filterTensor_GPU = nullptr;
    check(
        cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float)),
        "allocating filterTensor_GPU");
    check(
        cudaMemcpy(
            filterTensor_GPU,
            filterTensor_host,
            elementsInFilterTensor * sizeof(float),
            cudaMemcpyHostToDevice),
        "uploading the filter tensor");
    delete[] filterTensor_host;

    // -------------------------------------------------
    // Execute CalculateConvolutionOutputTensor__im2col.
    // -------------------------------------------------
    // Skipped when setup failed: the kernel would dereference invalid pointers.
    if (allCallsSucceeded) {
        CalculateConvolutionOutputTensor__im2col(
            convolutionOutputTensor,
            elementsInFilter,
            elementsInChannelOfOutputTensor_host,
            imagesInSubdivision,
            channelsInFilter_GPU,
            widthOfFilter_GPU,
            heightOfOutputTensor_GPU,
            widthOfOutputTensor_GPU,
            elementsInChannelOfOutputTensor_GPU,
            channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
            elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
            inputTensor_GPU,
            horizontalFilterStride_GPU,
            channelsInImage_GPU,
            verticalFilterStride_GPU,
            elementsInCrossSectionOfImage_GPU,
            elementsInImage_GPU,
            filters,
            filterTensor_GPU,
            elementsInOutputSubtensor);
        // The callee returns void, so pick up any error it left behind.
        check(cudaGetLastError(), "running CalculateConvolutionOutputTensor__im2col");
    }

    cudaFree(channelsInFilter_GPU);
    cudaFree(widthOfFilter_GPU);
    cudaFree(heightOfOutputTensor_GPU);
    cudaFree(widthOfOutputTensor_GPU);
    cudaFree(elementsInChannelOfOutputTensor_GPU);
    cudaFree(channelsInFilter_times_elementsInChannelOfOutputTensor_GPU);
    cudaFree(elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU);
    cudaFree(inputTensor_GPU);
    cudaFree(horizontalFilterStride_GPU);
    cudaFree(channelsInImage_GPU);
    cudaFree(verticalFilterStride_GPU);
    cudaFree(elementsInCrossSectionOfImage_GPU);
    cudaFree(elementsInImage_GPU);
    cudaFree(filterTensor_GPU);

    // --------------------------------------------------
    // Make sure that convolutionOutputTensor is correct.
    // --------------------------------------------------
    if (allCallsSucceeded) {
        float* convolutionOutputTensor_test = new float[elementsInConvolutionOutputTensor];
        check(
            cudaMemcpy(
                convolutionOutputTensor_test,
                convolutionOutputTensor,
                elementsInConvolutionOutputTensor * sizeof(float),
                cudaMemcpyDeviceToHost),
            "downloading convolutionOutputTensor");
        if (allCallsSucceeded) {
            printf("convolutionOutputTensor_test: {");
            for (int i = 0; i < 18; ++i) {
                printf("%f, ", convolutionOutputTensor_test[i]);
            }
            printf("...}\n");
        }
        delete[] convolutionOutputTensor_test;
    }
    cudaFree(convolutionOutputTensor);
    return allCallsSucceeded ? 0 : 1;
}
// Host-side entry point (declaration) for the im2col + cuBLAS convolution.
//
// For each of imagesInSubdivision_child images the definition launches the
// im2col kernel and then calls cublasSgemm, writing the product at
// convolutionOutputTensor_child + image * elementsInOutputSubtensor_child.
//
// The plain int parameters (elementsInFilter_child,
// elementsInChannelOfOutputTensor_host_child, imagesInSubdivision_child,
// filters_child, elementsInOutputSubtensor_child) are used on the host for
// sizing and as GEMM dimensions. Every int* parameter and inputTensor_child
// is forwarded unchanged to the im2col kernel, which dereferences it on the
// device; the driver in main.cu passes cudaMalloc'ed pointers for all of them.
// filterTensor and convolutionOutputTensor_child are handed to cublasSgemm.
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
// Forward declaration of the im2col kernel so the host function below can
// launch it before its definition appears.
//
// col_child receives the rearranged data and inputTensor_child_child is the
// source tensor. Every int* argument points to a single value that the kernel
// dereferences on the device (filter/output dimensions, precomputed strides,
// filter strides, the index of the image to process and the per-image size).
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image_child,
int* elementsInImage_child);
// Computes a batched convolution as im2col followed by one SGEMM per image.
//
// Parameters
//   convolutionOutputTensor_child  device buffer receiving the result; image i
//                                  is written at offset
//                                  i * elementsInOutputSubtensor_child.
//   elementsInFilter_child         number of elements in one filter; also the
//                                  number of im2col threads and the GEMM k.
//   elementsInChannelOfOutputTensor_host_child
//                                  output positions per channel (GEMM m).
//   imagesInSubdivision_child      number of images to process.
//   int* parameters, inputTensor_child
//                                  device pointers forwarded unchanged to the
//                                  im2col kernel.
//   filters_child                  number of filters (GEMM n).
//   filterTensor                   device buffer with all filters (GEMM B,
//                                  leading dimension elementsInFilter_child).
//   elementsInOutputSubtensor_child
//                                  elements of output produced per image.
//
// The function returns void, so failures cannot be reported directly. On any
// failure it stops, releases everything it allocated and returns; CUDA runtime
// errors are left in place (peeked, not cleared) so the caller can still
// retrieve them with cudaGetLastError().
void CalculateConvolutionOutputTensor__im2col(
    float* convolutionOutputTensor_child,
    int elementsInFilter_child,
    int elementsInChannelOfOutputTensor_host_child,
    int imagesInSubdivision_child,
    int* channelsInFilter,
    int* widthOfFilter,
    int* heightOfOutputTensor,
    int* widthOfOutputTensor,
    int* elementsInChannelOfOutputTensor_GPU_child,
    int* channelsInFilter_times_elementsInChannelOfOutputTensor,
    int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
    float* inputTensor_child,
    int* horizontalFilterStride,
    int* channelsInImage,
    int* verticalFilterStride,
    int* elementsInCrossSectionOfImage,
    int* elementsInImage,
    int filters_child,
    float* filterTensor,
    int elementsInOutputSubtensor_child)
{
    // Nothing to do for empty work; this also avoids a zero-block launch and
    // zero-sized GEMM leading dimensions, both of which are invalid.
    if (elementsInFilter_child <= 0 ||
        elementsInChannelOfOutputTensor_host_child <= 0 ||
        imagesInSubdivision_child <= 0) {
        return;
    }

    // -----------------------------------------
    // Define and declare parameters for im2col.
    // -----------------------------------------
    // The kernel has no bounds check, so the launch must cover exactly
    // elementsInFilter_child threads; any surplus thread would write past the
    // end of col. Start from the original block size of 885 and step down to
    // the nearest size that divides the element count (885 itself for the
    // 3 * 590 * 590 filter used by the driver).
    int threads_per_block_for_im2col = 885;
    if (threads_per_block_for_im2col > elementsInFilter_child) {
        threads_per_block_for_im2col = elementsInFilter_child;
    }
    while (elementsInFilter_child % threads_per_block_for_im2col != 0) {
        --threads_per_block_for_im2col;
    }
    const int blocks_for_im2col = elementsInFilter_child / threads_per_block_for_im2col;

    // col holds one column per filter element and one row per output position.
    // The size is computed in size_t so the product cannot overflow int.
    const size_t elementsInCol =
        static_cast<size_t>(elementsInFilter_child) *
        static_cast<size_t>(elementsInChannelOfOutputTensor_host_child);
    float* col = nullptr;
    if (cudaMalloc(&col, elementsInCol * sizeof(float)) != cudaSuccess) {
        return;
    }

    // -----------------------------------------------------------------------------
    // Define parameters for calculating the matrix product of filterTensor and col.
    // -----------------------------------------------------------------------------
    // Declaring cublasHandle requires '#include "cublas_v2.h"'.
    // Defining cublasHandle requires adding "cublas.lib" to
    // Properties -> Linker -> Input -> Additional Dependencies.
    cublasHandle_t cublasHandle;
    if (cublasCreate(&cublasHandle) != CUBLAS_STATUS_SUCCESS) {
        cudaFree(col);
        return;
    }
    // alpha = 1 and beta = 0: the product overwrites the output slice.
    const float one = 1.0f;
    const float zero = 0.0f;

    // Device-side copy of the loop counter, read by the kernel.
    int* image_GPU = nullptr;
    if (cudaMalloc(&image_GPU, sizeof(int)) != cudaSuccess) {
        cublasDestroy(cublasHandle);
        cudaFree(col);
        return;
    }

    // ------------------------------------------------------------
    // For each image in subdivision,
    // sculpt image into matrix col.
    // Calculate the matrix product of filterTensor and col and
    // store the product as a subtensor of convolutionOutputTensor.
    // ------------------------------------------------------------
    for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
        if (cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) {
            break;
        }
        im2col<<<blocks_for_im2col, threads_per_block_for_im2col>>>
            (col,
            channelsInFilter,
            widthOfFilter,
            heightOfOutputTensor,
            widthOfOutputTensor,
            elementsInChannelOfOutputTensor_GPU_child,
            channelsInFilter_times_elementsInChannelOfOutputTensor,
            elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
            inputTensor_child,
            horizontalFilterStride,
            channelsInImage,
            verticalFilterStride,
            elementsInCrossSectionOfImage,
            image_GPU,
            elementsInImage);
        // Peek (do not clear) so a launch error stays visible to the caller.
        if (cudaPeekAtLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
            break;
        }
        // Offset of this image's output slice, computed in size_t to avoid
        // overflowing int for large batches.
        const size_t outputOffset =
            static_cast<size_t>(image_host) * static_cast<size_t>(elementsInOutputSubtensor_child);
        // Column-major GEMM: C (m x n) = col (m x k) * filterTensor (k x n) with
        // m = output positions, n = filters, k = elements per filter.
        const cublasStatus_t gemmStatus = cublasSgemm(
            cublasHandle,
            CUBLAS_OP_N,
            CUBLAS_OP_N,
            elementsInChannelOfOutputTensor_host_child,
            filters_child,
            elementsInFilter_child,
            &one,
            col,
            elementsInChannelOfOutputTensor_host_child,
            filterTensor,
            elementsInFilter_child,
            &zero,
            convolutionOutputTensor_child + outputOffset,
            elementsInChannelOfOutputTensor_host_child);
        if (gemmStatus != CUBLAS_STATUS_SUCCESS) {
            break;
        }
    }

    // The handle was previously leaked on every call.
    cublasDestroy(cublasHandle);
    cudaFree(col);
    cudaFree(image_GPU);
}
// im2col kernel: one thread per filter element.
//
// The flat thread index is decomposed as
//   index = c_prime + channelsInFilter * (w_prime + widthOfFilter * h_prime)
// and the thread then copies, for every output position (h, w), the input
// element under filter tap (c_prime, w_prime, h_prime) into col_child.
//
// Launch requirements
//   * 1-D grid and 1-D blocks; only the x dimensions are used.
//   * There is no bounds check, so gridDim.x * blockDim.x must equal the
//     number of filter elements; surplus threads would write outside col_child.
//   * No shared memory is used.
//
// Every int* argument points to a single device-resident value. Those values
// do not change during the kernel, so they are loaded once up front: the
// compiler cannot hoist the loads itself because the stores to col_child might
// alias them, which made the original re-read global memory on every
// iteration of the inner loop.
//
// NOTE(review): threads with adjacent index differ in c_prime and therefore
// write col_child locations elementsInChannelOfOutputTensor apart, so the
// stores look uncoalesced; worth confirming in the profiler.
__global__
void im2col(
    float* col_child,
    int* channelsInFilter_child,
    int* widthOfFilter_child,
    int* heightOfOutputTensor_child,
    int* widthOfOutputTensor_child,
    int* elementsInChannelOfOutputTensor_child,
    int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
    int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
    float* inputTensor_child_child,
    int* horizontalFilterStride_child,
    int* channelsInImage_child,
    int* verticalFilterStride_child,
    int* elementsInCrossSectionOfImage_child,
    int* image,
    int* elementsInImage_child)
{
    // Loop-invariant scalars, read from global memory exactly once.
    const int channelsInFilter = *channelsInFilter_child;
    const int widthOfFilter = *widthOfFilter_child;
    const int heightOfOutputTensor = *heightOfOutputTensor_child;
    const int widthOfOutputTensor = *widthOfOutputTensor_child;
    const int channelsInImage = *channelsInImage_child;
    const int elementsInCrossSectionOfImage = *elementsInCrossSectionOfImage_child;
    // Input distance between horizontally / vertically adjacent output positions.
    const int horizontalStep = (*horizontalFilterStride_child) * channelsInImage;
    const int verticalStep = (*verticalFilterStride_child) * elementsInCrossSectionOfImage;

    // Which filter tap this thread is responsible for.
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int c_prime = index % channelsInFilter;
    const int temp = (index - c_prime) / channelsInFilter;
    const int w_prime = temp % widthOfFilter;
    const int h_prime = temp / widthOfFilter;

    // Offsets that depend only on the filter tap (and the image), not on (h, w).
    const int colBase =
        c_prime * (*elementsInChannelOfOutputTensor_child) +
        w_prime * (*channelsInFilter_times_elementsInChannelOfOutputTensor_child) +
        h_prime * (*elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child);
    const int inputBase =
        c_prime +
        w_prime * channelsInImage +
        h_prime * elementsInCrossSectionOfImage +
        (*image) * (*elementsInImage_child);

    for (int h = 0; h < heightOfOutputTensor; ++h) {
        const int colRow = colBase + h * widthOfOutputTensor;
        const int inputRow = inputBase + h * verticalStep;
        for (int w = 0; w < widthOfOutputTensor; ++w) {
            col_child[colRow + w] = inputTensor_child_child[inputRow + w * horizontalStep];
        }
    }
}
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
// Test driver for the reduced-parameter variant of
// CalculateConvolutionOutputTensor__im2col.
//
// Uploads a synthetic input tensor of 3 * 608 * 608 * 4 floats and a random
// filter tensor of 6 * 3 * 3 * 590 * 590 floats, uploads the five scalars the
// im2col kernel reads as single device ints, runs the convolution, and prints
// the first 18 output values.
//
// Returns 0 on success and 1 if any CUDA runtime call reported an error
// (each failure is also printed), so a failed run no longer prints
// uninitialised output as if it were a result.
int main()
{
    // Set to false by check() the first time a CUDA runtime call fails.
    bool allCallsSucceeded = true;
    // Reports a failed CUDA runtime call and remembers that it happened.
    auto check = [&allCallsSucceeded](cudaError_t status, const char* action) {
        if (status != cudaSuccess) {
            printf("CUDA error while %s: %s\n", action, cudaGetErrorString(status));
            allCallsSucceeded = false;
        }
    };
    // Allocates one int on the device, copies `value` into it and returns the
    // device pointer (nullptr when the allocation failed; cudaFree accepts that).
    auto uploadInt = [&check](int value) -> int* {
        int* devicePointer = nullptr;
        check(cudaMalloc(&devicePointer, sizeof(int)), "allocating a device int");
        if (devicePointer != nullptr) {
            check(
                cudaMemcpy(devicePointer, &value, sizeof(int), cudaMemcpyHostToDevice),
                "copying an int to the device");
        }
        return devicePointer;
    };

    // --------------------------------------------------------------------------
    // Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
    // --------------------------------------------------------------------------
    const int imagesInSubdivision = 4;
    const int elementsInFilter = 3 * 590 * 590;
    const int elementsInChannelOfOutputTensor = 19 * 19;
    const int filters = 6 * 3;
    const int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
    const int elementsInConvolutionOutputTensor = elementsInOutputSubtensor * imagesInSubdivision;

    float* convolutionOutputTensor = nullptr;
    check(
        cudaMalloc(&convolutionOutputTensor, elementsInConvolutionOutputTensor * sizeof(float)),
        "allocating convolutionOutputTensor");

    // Input tensor: a repeating ramp in [0, 1).
    const int elementsInInputTensor = 3 * 608 * 608 * 4;
    float* inputTensor_host = new float[elementsInInputTensor];
    for (int i = 0; i < elementsInInputTensor; ++i) {
        inputTensor_host[i] = ((float)(i % 255)) / 255.0f;
    }
    float* inputTensor_GPU = nullptr;
    check(
        cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float)),
        "allocating inputTensor_GPU");
    check(
        cudaMemcpy(
            inputTensor_GPU,
            inputTensor_host,
            elementsInInputTensor * sizeof(float),
            cudaMemcpyHostToDevice),
        "uploading the input tensor");
    delete[] inputTensor_host;

    // Scalars that the im2col kernel dereferences on the device.
    int* heightOfFilter_GPU = uploadInt(590);
    int* channelsInImage_GPU = uploadInt(3);
    int* widthOfImage_GPU = uploadInt(608);
    int* widthOfOutputTensor_GPU = uploadInt(19);
    int* heightOfImage_GPU = uploadInt(608);

    // Filter tensor: normally distributed values scaled by 1 / sqrt(element count).
    const int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
    float* filterTensor_host = new float[elementsInFilterTensor];
    std::default_random_engine randomNumberGenerator(time(NULL));
    std::normal_distribution<float> normalDistribution(0.0, 1.0);
    // Loop-invariant, so compute it once instead of once per element.
    const auto filterScale = sqrt((float)elementsInFilterTensor);
    for (int i = 0; i < elementsInFilterTensor; ++i) {
        filterTensor_host[i] = normalDistribution(randomNumberGenerator) / filterScale;
    }
    float* filterTensor_GPU = nullptr;
    check(
        cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float)),
        "allocating filterTensor_GPU");
    check(
        cudaMemcpy(
            filterTensor_GPU,
            filterTensor_host,
            elementsInFilterTensor * sizeof(float),
            cudaMemcpyHostToDevice),
        "uploading the filter tensor");
    delete[] filterTensor_host;

    // -------------------------------------------------
    // Execute CalculateConvolutionOutputTensor__im2col.
    // -------------------------------------------------
    // Skipped when setup failed: the kernel would dereference invalid pointers.
    if (allCallsSucceeded) {
        CalculateConvolutionOutputTensor__im2col(
            convolutionOutputTensor,
            elementsInFilter,
            elementsInChannelOfOutputTensor,
            imagesInSubdivision,
            inputTensor_GPU,
            heightOfFilter_GPU,
            channelsInImage_GPU,
            widthOfImage_GPU,
            widthOfOutputTensor_GPU,
            heightOfImage_GPU,
            filters,
            filterTensor_GPU,
            elementsInOutputSubtensor);
        // The callee returns void, so pick up any error it left behind.
        check(cudaGetLastError(), "running CalculateConvolutionOutputTensor__im2col");
    }

    cudaFree(inputTensor_GPU);
    cudaFree(heightOfFilter_GPU);
    cudaFree(channelsInImage_GPU);
    cudaFree(widthOfImage_GPU);
    cudaFree(widthOfOutputTensor_GPU);
    cudaFree(heightOfImage_GPU);
    cudaFree(filterTensor_GPU);

    // --------------------------------------------------
    // Make sure that convolutionOutputTensor is correct.
    // --------------------------------------------------
    if (allCallsSucceeded) {
        float* convolutionOutputTensor_test = new float[elementsInConvolutionOutputTensor];
        check(
            cudaMemcpy(
                convolutionOutputTensor_test,
                convolutionOutputTensor,
                elementsInConvolutionOutputTensor * sizeof(float),
                cudaMemcpyDeviceToHost),
            "downloading convolutionOutputTensor");
        if (allCallsSucceeded) {
            printf("convolutionOutputTensor_test: {");
            for (int i = 0; i < 18; ++i) {
                printf("%f, ", convolutionOutputTensor_test[i]);
            }
            printf("...}\n");
        }
        delete[] convolutionOutputTensor_test;
    }
    cudaFree(convolutionOutputTensor);
    return allCallsSucceeded ? 0 : 1;
}
// Host-side entry point (declaration) for the reduced-parameter im2col +
// cuBLAS convolution.
//
// For each of imagesInSubdivision_child images the definition launches the
// im2col kernel and then calls cublasSgemm, writing the product at
// convolutionOutputTensor_child + image * elementsInOutputSubtensor_child.
//
// The plain int parameters are used on the host for sizing and as GEMM
// dimensions. inputTensor_child and every int* parameter (heightOfFilter,
// channelsInImage, widthOfImage, widthOfOutputTensor, heightOfImage) are
// forwarded unchanged to the kernel, which dereferences them on the device;
// the driver in main.cu passes cudaMalloc'ed pointers for all of them.
// filterTensor and convolutionOutputTensor_child are handed to cublasSgemm.
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
// Forward declaration of the reduced-parameter im2col kernel so the host
// function below can launch it before its definition appears.
//
// col_child receives the rearranged data and inputTensor_child_child is the
// source tensor. Every int* argument points to a single value that the kernel
// dereferences on the device: filter height, channel count, image width,
// output width, the index of the image to process and the image height.
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child);
// Computes a batched convolution as im2col followed by one SGEMM per image
// (reduced-parameter variant).
//
// Parameters
//   convolutionOutputTensor_child  device buffer receiving the result; image i
//                                  is written at offset
//                                  i * elementsInOutputSubtensor_child.
//   elementsInFilter_child         number of elements in one filter (GEMM k).
//   elementsInChannelOfOutputTensor_host_child
//                                  output positions per channel (GEMM n).
//   imagesInSubdivision_child      number of images to process.
//   inputTensor_child, int* parameters
//                                  device pointers forwarded unchanged to the
//                                  im2col kernel.
//   filters_child                  number of filters (GEMM m).
//   filterTensor                   device buffer with all filters (GEMM A,
//                                  leading dimension filters_child).
//   elementsInOutputSubtensor_child
//                                  elements of output produced per image.
//
// The kernel writes one element of col per thread and is launched with
// 3 * 590 / 2 = 885 threads per block, i.e. it is tied to filter rows of
// 3 * 590 elements. The block count is derived from the size arguments instead
// of being hard-coded; for the driver's sizes it equals the original
// 2 * 590 * 19 * 19.
//
// The function returns void, so failures cannot be reported directly. On any
// failure it stops, releases everything it allocated and returns; CUDA runtime
// errors are left in place (peeked, not cleared) so the caller can still
// retrieve them with cudaGetLastError().
void CalculateConvolutionOutputTensor__im2col(
    float* convolutionOutputTensor_child,
    int elementsInFilter_child,
    int elementsInChannelOfOutputTensor_host_child,
    int imagesInSubdivision_child,
    float* inputTensor_child,
    int* heightOfFilter,
    int* channelsInImage,
    int* widthOfImage,
    int* widthOfOutputTensor,
    int* heightOfImage,
    int filters_child,
    float* filterTensor,
    int elementsInOutputSubtensor_child)
{
    // Nothing to do for empty work; this also avoids a zero-block launch and
    // zero-sized GEMM leading dimensions, both of which are invalid.
    if (elementsInFilter_child <= 0 ||
        elementsInChannelOfOutputTensor_host_child <= 0 ||
        imagesInSubdivision_child <= 0) {
        return;
    }

    // -----------------------------------------
    // Define and declare parameters for im2col.
    // -----------------------------------------
    // col has one entry per (filter element, output position) pair. The size is
    // computed in size_t so the product cannot overflow int.
    const size_t elementsInCol =
        static_cast<size_t>(elementsInFilter_child) *
        static_cast<size_t>(elementsInChannelOfOutputTensor_host_child);
    // Half of one 3 * 590 element filter row per block, as the kernel expects.
    const int threads_per_block_for_im2col = 3 * 590 / 2;
    // The kernel has no bounds check and forms its index in 32 bits, so the
    // launch must cover col exactly and the element count must fit in 32 bits.
    // Refuse sizes that would make it write outside col.
    if (elementsInCol % threads_per_block_for_im2col != 0 || elementsInCol > 0xFFFFFFFFu) {
        return;
    }
    const int blocks_for_im2col = static_cast<int>(elementsInCol / threads_per_block_for_im2col);

    float* col = nullptr;
    if (cudaMalloc(&col, elementsInCol * sizeof(float)) != cudaSuccess) {
        return;
    }

    // -----------------------------------------------------------------------------
    // Define parameters for calculating the matrix product of filterTensor and col.
    // -----------------------------------------------------------------------------
    // Declaring cublasHandle requires '#include "cublas_v2.h"'.
    // Defining cublasHandle requires adding "cublas.lib" to
    // Properties -> Linker -> Input -> Additional Dependencies.
    cublasHandle_t cublasHandle;
    if (cublasCreate(&cublasHandle) != CUBLAS_STATUS_SUCCESS) {
        cudaFree(col);
        return;
    }
    // alpha = 1 and beta = 0: the product overwrites the output slice.
    const float one = 1.0f;
    const float zero = 0.0f;

    // Device-side copy of the loop counter, read by the kernel.
    int* image_GPU = nullptr;
    if (cudaMalloc(&image_GPU, sizeof(int)) != cudaSuccess) {
        cublasDestroy(cublasHandle);
        cudaFree(col);
        return;
    }

    // ------------------------------------------------------------
    // For each image in subdivision,
    // sculpt image into matrix col.
    // Calculate the matrix product of filterTensor and col and
    // store the product as a subtensor of convolutionOutputTensor.
    // ------------------------------------------------------------
    for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
        if (cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) {
            break;
        }
        im2col
            <<<blocks_for_im2col,
            threads_per_block_for_im2col>>>
            (col,
            inputTensor_child,
            heightOfFilter,
            channelsInImage,
            widthOfImage,
            widthOfOutputTensor,
            image_GPU,
            heightOfImage);
        // Peek (do not clear) so a launch error stays visible to the caller.
        if (cudaPeekAtLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
            break;
        }
        // Offset of this image's output slice, computed in size_t to avoid
        // overflowing int for large batches.
        const size_t outputOffset =
            static_cast<size_t>(image_host) * static_cast<size_t>(elementsInOutputSubtensor_child);
        // Column-major GEMM: C (m x n) = filterTensor (m x k) * col (k x n) with
        // m = filters, n = output positions, k = elements per filter.
        const cublasStatus_t gemmStatus = cublasSgemm(
            cublasHandle,
            CUBLAS_OP_N,
            CUBLAS_OP_N,
            filters_child,
            elementsInChannelOfOutputTensor_host_child,
            elementsInFilter_child,
            &one,
            filterTensor,
            filters_child,
            col,
            elementsInFilter_child,
            &zero,
            convolutionOutputTensor_child + outputOffset,
            filters_child);
        if (gemmStatus != CUBLAS_STATUS_SUCCESS) {
            break;
        }
    }

    // The handle was previously leaked on every call.
    cublasDestroy(cublasHandle);
    cudaFree(col);
    cudaFree(image_GPU);
}
// im2col kernel (reduced-parameter variant): one thread per element of col.
//
// Thread (blockIdx.x, threadIdx.x) writes
// col_child[blockIdx.x * blockDim.x + threadIdx.x]. Block indices are decoded as
//   halfRow        = blockIdx.x % 2                     which half of a filter row
//   filterRow      = (blockIdx.x % (2 * filterHeight)) / 2
//   outputPosition = blockIdx.x / (2 * filterHeight)    flat output position
//   outputColumn   = outputPosition % outputWidth
//   outputRow      = outputPosition / outputWidth
// so each pair of consecutive blocks copies one contiguous row of the filter
// window from the input.
//
// Launch requirements
//   * 1-D grid and 1-D blocks; only the x dimensions are used.
//   * 2 * blockDim.x is presumably channels * filter width (one window row);
//     the host launches 3 * 590 / 2 threads - TODO confirm for other sizes.
//   * There is no bounds check: gridDim.x * blockDim.x must equal the size of col.
//   * No stride parameter exists; adjacent output positions are one pixel
//     apart in the input.
//   * No shared memory is used.
//
// Fix: the horizontal offset previously used the flat output position
// (blockIdx.x / (2 * filterHeight)) without reducing it modulo the output
// width. Every output row was therefore shifted by an extra
// outputRow * outputWidth * channels elements, which sampled the wrong window
// and, for the last image, read past the end of the input tensor.
__global__
void im2col(
    float* col_child,
    float* inputTensor_child_child,
    int* heightOfFilter_child,
    int* channelsInImage_child,
    int* widthOfImage_child,
    int* widthOfOutputTensor_child,
    int* image,
    int* heightOfImage_child)
{
    // Each device scalar is loaded once; unsigned to match blockIdx arithmetic.
    const unsigned int filterHeight = *heightOfFilter_child;
    const unsigned int channels = *channelsInImage_child;
    const unsigned int outputWidth = *widthOfOutputTensor_child;
    // Number of floats in one row of the image.
    const unsigned int imageRowPitch = channels * (*widthOfImage_child);
    const unsigned int imageOffset = (*image) * imageRowPitch * (*heightOfImage_child);

    const unsigned int blocksPerOutputPosition = 2 * filterHeight;
    const unsigned int halfRow = blockIdx.x % 2;
    const unsigned int filterRow = (blockIdx.x % blocksPerOutputPosition) / 2;
    const unsigned int outputPosition = blockIdx.x / blocksPerOutputPosition;
    const unsigned int outputColumn = outputPosition % outputWidth;
    const unsigned int outputRow = outputPosition / outputWidth;

    col_child[blockIdx.x * blockDim.x + threadIdx.x] =
        inputTensor_child_child[
            threadIdx.x +
            halfRow * blockDim.x +
            (filterRow + outputRow) * imageRowPitch +
            outputColumn * channels +
            imageOffset];
}