CUDA中的乘法矩形矩阵
在本作业中,我需要使用CUDA C完成两个矩形矩阵相乘的代码。完成代码后,我提交了解决方案,当矩阵为正方形时,数据集的解决方案是正确的,而当矩阵不为正方形时,结果与预期值不符 以下是我添加缺失部分后的代码:CUDA中的乘法矩形矩阵,cuda,Cuda,在本作业中,我需要使用CUDA C完成两个矩形矩阵相乘的代码。完成代码后,我提交了解决方案,当矩阵为正方形时,数据集的解决方案是正确的,而当矩阵不为正方形时,结果与预期值不符 以下是我添加缺失部分后的代码: #include <wb.h> #define wbCheck(stmt) do { \ cudaError_t err = stmt; \ if
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return ;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
C[Row*numAColumns + Col] = Cvalue;
}
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 DimGrid(numARows / 8 , numBColumns / 8, 1);
dim3 DimBlock(8 , 8, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
#包括
#定义wbCheck(stmt)do{\
cudaError\u t err=stmt\
如果(err!=cudaSuccess){\
wbLog(错误,“无法运行stmt”,#stmt)\
返回-1\
} \
}而(0)
//计算C=A*B
__全局无效矩阵倍数(浮点*A、浮点*B、浮点*C、,
内特纽马罗,内特纽马科伦,
int numberrows,int numbercolumns,
整数行,整数列){
//@@在此处插入实现矩阵乘法的代码
int Row=blockIdx.y*blockDim.y+threadIdx.y;
int Col=blockIdx.x*blockDim.x+threadIdx.x;
if(numColumns!=numBRows)返回;
如果((行
我希望你能帮我找出哪个部分不正确 替换:
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
与
替换:
for(int k=0;k
与
for(int k=0;k
更换
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
为了
及
为了
在Ira、Ahmad、ram和Oli Fly的帮助下,我得到了如下正确答案:
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0; k < numAColumns; ++k)
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
}
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 DimGrid((numCColumns - 1) / 8 + 1, (numCRows - 1) / 8 + 1, 1);
dim3 DimBlock(8 , 8, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
#包括
#定义wbCheck(stmt)do{\
cudaError\u t err=stmt\
如果(err!=cudaSuccess){\
wbLog(错误,“无法运行stmt”,#stmt)\
返回-1\
} \
}而(0)
//计算C=A*B
__全局无效矩阵倍数(浮点*A、浮点*B、浮点*C、,
内特纽马罗,内特纽马科伦,
int numberrows,int numbercolumns,
整数行,整数列){
//@@在此处插入实现矩阵乘法的代码
int Row=blockIdx.y*blockDim.y+threadIdx.y;
int Col=blockIdx.x*blockDim.x+threadIdx.x;
if(numColumns!=numBRows)返回;
如果((行Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col];
C[Row*numAColumns + Col] = Cvalue;
C[Row*numCColumns+Col] = Cvalue;
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0; k < numAColumns; ++k)
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
}
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 DimGrid((numCColumns - 1) / 8 + 1, (numCRows - 1) / 8 + 1, 1);
dim3 DimBlock(8 , 8, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
//@@ You have to use shared memory for this MP
const int TILE_WIDTH = 32;
__shared__ float sharedA[TILE_WIDTH][TILE_WIDTH];
__shared__ float sharedB[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;
int Col = bx*TILE_WIDTH + tx;
float Cvalue = 0.0;
if (numAColumns != numBRows) return ;
for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
{
if (i*TILE_WIDTH + tx < numAColumns && Row < numARows){
sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx];
}else{
sharedA[ty][tx] = 0.0;
}
if (i*TILE_WIDTH + ty < numBRows && Col < numBColumns){
sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col];
}else{
sharedB[ty][tx] = 0.0;
}
__syncthreads();
if(Row < numARows && Col < numBColumns){
for(int j = 0; j < TILE_WIDTH; j++)
Cvalue += sharedA[ty][j] * sharedB[j][tx];
}
__syncthreads();
}
if (Row < numCRows && Col < numCColumns)
C[Row*numCColumns + Col] = Cvalue;
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
int TILE_WIDTH = 32;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA , sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
int dimX = (int)(ceil((float)numCColumns / TILE_WIDTH));
int dimY = (int)(ceil((float)numCRows / TILE_WIDTH));
dim3 DimGrid(dimX, dimY);
dim3 DimBlock(TILE_WIDTH, TILE_WIDTH);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}