CUDA 内核调用 cuLaunchKernel() 中 'sharedMemBytes' 参数的意义是什么？
我试图在 JCuda 中使用共享内存实现一个简单的矩阵乘法程序。以下是我的 JCudaSharedMatrixMul.java 代码：
import static jcuda.driver.JCudaDriver.cuCtxCreate;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoad;
import static jcuda.runtime.JCuda.cudaEventCreate;
import static jcuda.runtime.JCuda.cudaEventRecord;
import static jcuda.runtime.JCuda.*;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
import jcuda.runtime.cudaEvent_t;
public class JCudaSharedMatrixMul
{
    /**
     * Entry point: multiplies two 16x16 matrices of ones on the GPU using a
     * shared-memory kernel loaded from JCudaSharedMatrixMulKernel.cu, prints
     * timing information, and prints the resulting matrix.
     *
     * @param args unused
     * @throws IOException if the .cu file is missing or nvcc fails
     */
    public static void main(String[] args) throws IOException
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);

        // Create the PTX file by calling the NVCC
        String ptxFilename = preparePtxFile("JCudaSharedMatrixMulKernel.cu");

        // Initialize the driver and create a context for the first device
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        // Load the PTX module and obtain a handle to the kernel function
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFilename);
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "jCudaSharedMatrixMulKernel");

        int numRows = 16;
        int numCols = 16;

        // Allocate and fill the host input matrices (all ones)
        float hostMatrixA[] = new float[numRows * numCols];
        float hostMatrixB[] = new float[numRows * numCols];
        float hostMatrixC[] = new float[numRows * numCols];
        for (int i = 0; i < numRows; i++)
        {
            for (int j = 0; j < numCols; j++)
            {
                hostMatrixA[i * numCols + j] = 1.0f;
                hostMatrixB[i * numCols + j] = 1.0f;
            }
        }

        // Allocate the device input data and copy the host data to the device
        CUdeviceptr devMatrixA = new CUdeviceptr();
        cuMemAlloc(devMatrixA, numRows * numCols * Sizeof.FLOAT);
        cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);

        CUdeviceptr devMatrixB = new CUdeviceptr();
        cuMemAlloc(devMatrixB, numRows * numCols * Sizeof.FLOAT);
        cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixB), numRows * numCols * Sizeof.FLOAT);

        // Allocate device matrix C to store the output
        CUdeviceptr devMatrixC = new CUdeviceptr();
        cuMemAlloc(devMatrixC, numRows * numCols * Sizeof.FLOAT);

        // Set up the kernel parameters: a pointer to an array of pointers
        // which point to the actual argument values
        Pointer kernelParameters = Pointer.to(
            Pointer.to(new int[]{numCols}),
            Pointer.to(devMatrixA),
            Pointer.to(devMatrixB),
            Pointer.to(devMatrixC));

        // Kernel thread configuration: a single 16x16 thread block.
        // BUG FIX: the block y-dimension was previously hard-coded to 16;
        // using blockSize for both keeps the configuration consistent.
        int blockSize = 16;
        int gridSize = 1;

        cudaEvent_t start = new cudaEvent_t();
        cudaEvent_t stop = new cudaEvent_t();
        cudaEventCreate(start);
        cudaEventCreate(stop);

        long start_nano = System.nanoTime();
        cudaEventRecord(start, null);

        // BUG FIX: sharedMemBytes was an arbitrary 250. That parameter only
        // reserves DYNAMIC shared memory; the kernel declares its tiles as
        // static __shared__ float[16][16] arrays, so 0 dynamic bytes are
        // required here.
        cuLaunchKernel(function,
            gridSize, 1, 1,          // grid dimensions
            blockSize, blockSize, 1, // block dimensions
            0, null,                 // no dynamic shared memory, default stream
            kernelParameters, null);
        cuCtxSynchronize();

        cudaEventRecord(stop, null);
        long end_nano = System.nanoTime();

        float elapsedTimeMsArray[] = { Float.NaN };
        cudaEventElapsedTime(elapsedTimeMsArray, start, stop);
        float elapsedTimeMs = elapsedTimeMsArray[0];
        System.out.println("Time Required (Using cudaevent elapsed time) = " + " " + elapsedTimeMs +
            "Time Required (Using nanotime)= " + (end_nano - start_nano) / 1000000);

        // Copy the device output back to the host
        cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);

        // Print the result matrix.
        // BUG FIX: the inner loop previously iterated over numRows; it must
        // iterate over numCols (harmless here only because both are 16).
        for (int i = 0; i < numRows; i++)
        {
            for (int j = 0; j < numCols; j++)
            {
                System.out.print(" " + hostMatrixC[i * numCols + j]);
            }
            System.out.println("");
        }

        // Release device memory
        cuMemFree(devMatrixA);
        cuMemFree(devMatrixB);
        cuMemFree(devMatrixC);
    }

    /**
     * Compiles the given CUDA source file to a PTX file with nvcc, unless a
     * PTX file with the corresponding name already exists.
     *
     * @param cuFileName the name of the .cu source file
     * @return the name of the resulting .ptx file
     * @throws IOException if the source file is missing or nvcc fails
     */
    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        // BUG FIX: the original had a dangling empty block after this
        // un-braced if; braces make the intended structure explicit.
        if (endIndex == -1)
        {
            endIndex = cuFileName.length() - 1;
        }
        String ptxFileName = cuFileName.substring(0, endIndex + 1) + "ptx";
        File ptxFile = new File(ptxFileName);
        if (ptxFile.exists())
        {
            return ptxFileName;
        }
        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: " + cuFileName);
        }
        // Target the JVM's data model (-m32/-m64)
        String modelString = "-m" + System.getProperty("sun.arch.data.model");
        System.out.println("Executing\n" +
            "nvcc " + modelString + " -ptx " + cuFile.getPath() + " -o " + ptxFileName);
        // Use ProcessBuilder with an argument list instead of
        // Runtime.exec(String) so paths containing spaces survive intact.
        Process process = new ProcessBuilder(
            "nvcc", modelString, "-ptx", cuFile.getPath(), "-o", ptxFileName).start();
        String errorMessage =
            new String(toByteArray(process.getErrorStream()), StandardCharsets.UTF_8);
        String outputMessage =
            new String(toByteArray(process.getInputStream()), StandardCharsets.UTF_8);
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            // Restore the interrupt flag before translating to IOException
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }
        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue " + exitValue);
            System.out.println("errorMessage:\n" + errorMessage);
            System.out.println("outputMessage:\n" + outputMessage);
            throw new IOException(
                "Could not create .ptx file: " + errorMessage);
        }
        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    /**
     * Fully reads the given InputStream and returns its contents as a byte
     * array. The stream is not closed by this method.
     *
     * @param inputStream the stream to read
     * @return the stream contents
     * @throws IOException if reading fails
     */
    private static byte[] toByteArray(InputStream inputStream) throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }
}
当我将其定义为128时,它会给出与上面相同的错误。但当我把它设为129时,它会给我正确的输出!当我给出129到49024之间的任何值时,它会给出正确的结果。
我的问题是,当我将其定义为128时,为什么不能获得正确的输出?另外,可以定义的最大共享内存是多少?为什么129-49024系列在这里工作?您正在启动16x16线程块:
cuLaunchKernel(function,
gridSize, 1, 1,
blockSize, 16, 1, <-- the first two params are block.x and block.y
250, null, kernelParameters, null);
你的代码在这一点上是有问题的。如果用 cuda-memcheck 运行，它很可能会捕获到这些越界访问——即使程序表面上"通过"了。查看 CUDA 自带的 matrixMulDrv 示例代码会很有启发性：你会看到那里分配的共享内存大小是 2*blockSize*blockSize，这对你的情况同样适用；但你的共享内存数组应该定义为 [16][16]
而不是 [4][4]。
共享内存的分配粒度也许恰好使超过 128 字节的取值能够工作，但代码本身仍然存在缺陷。
您的共享定义应为:
__shared__ float ads[16][16];
__shared__ float bds[16][16];
由于上面的分配是静态分配，而 sharedMemBytes 参数对应的是动态共享内存分配，因此在本例中你不需要分配任何动态共享内存（传 0 即可），程序仍然可以正常工作。静态分配与动态分配的区别在相关文档中有说明。
每个线程块可用的最大共享内存可以在编程指南中查到，也可以运行 CUDA 的 deviceQuery 示例代码获得。对于计算能力 2.0 及更高的设备，它是 48K 字节。
评论 1：我当时正准备写一个答案——我花了更长的时间才弄明白 ;-) +1
评论 2（@Robert Crovella）：我理解我在这里声明共享内存的错误了。我已经更正了内核代码，把它改成了 "__shared__ float ads[16][16]" 和 "__shared__ float bds[16][16]"。现在在本例中我不必定义 "sharedMemBytes" 的值，它甚至对 0 也有效。这种行为正确吗？
另一个回答（补充 Robert Crovella 指出的错误）：你的说法是正确的——对于两个 float[4][4] 共享数组，该参数应该是 128。（但数组必须更大，因此在修复错误之后，还必须相应地更新 sharedMemBytes。）Robert 也提到的 deviceQuery 程序会显示每个线程块可用的最大共享内存，它的 JCuda 版本也可以获得。@Marco13：如上所述，改为静态 [16][16] 声明后，sharedMemBytes 传 0 即可正常工作，这种行为是正确的。
cuLaunchKernel(function,
gridSize, 1, 1,
blockSize, 16, 1, <-- the first two params are block.x and block.y
250, null, kernelParameters, null);
ads[ty][tx] = ad[Row * N + (i * TILE) + tx];
bds[ty][tx] = bd[(i * TILE + ty) * N + Col];
^ ^
| tx goes from 0..15 for a 16x16 threadblock
ty goes from 0..15 for a 16x16 threadblock
__shared__ float ads[16][16];
__shared__ float bds[16][16];