Cuda 监控线程块如何跨执行时间分配给SMs?
我是CUDA评测的初学者。我基本上想生成一个时间线，显示每个SM以及在执行期间分配给它的线程块，类似于此（作者：Sreepathi Pai）。我读过有关读取%smid寄存器的内容，但我不知道如何将其与我要测试的代码结合起来，或者如何将其与线程块或时间关联起来。
// Returns the ID of the streaming multiprocessor (SM) on which the calling
// thread is currently executing, by reading the PTX special register %smid.
// NOTE(review): the SM a block runs on is not guaranteed stable for the
// block's whole lifetime on all architectures — treat the value as a hint.
__noinline__ __device__ uint get_smid(void)
{
    uint ret;
    // In inline PTX a literal '%' must be written as "%%" whenever the asm
    // string also uses operand references such as %0 — a bare "%smid" is
    // mis-parsed as an operand specifier.
    asm("mov.u32 %0, %%smid;" : "=r"(ret) );
    return ret;
}
源代码。完整代码超出了此答案的范围,因此此答案为您提供了实现块跟踪的构建块
// Query the PTX special register %smid: the index of the streaming
// multiprocessor on which the calling thread currently resides.
static __device__ inline uint32_t __smid()
{
    uint32_t sm;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm));
    return sm;
}
// Read the device-wide nanosecond clock via the PTX special register
// %globaltimer. Available on compute capability >= 3.0 (Kepler/Maxwell and
// newer); on compute capability 2.x (Fermi) use clock64() instead.
static __device__ inline uint64_t __timestamp()
{
    uint64_t t;
    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(t));
    return t;
}
// Records, for every thread block, a start word (start timestamp with the SM
// id packed into its 4 lowest bits) and an end timestamp, so a host-side pass
// can reconstruct a block-to-SM execution timeline.
//
// pBlockTime must point to device memory holding at least
// 2 * gridDim.x * gridDim.y * gridDim.z uint64_t entries:
//   pBlockTime[2*i + 0] = start timestamp, SM id in bits 0..3
//   pBlockTime[2*i + 1] = end timestamp
__global__ void blocktime(uint64_t* pBlockTime)
{
    // START TIMESTAMP — read as early as possible so scheduling latency is
    // captured, not the kernel body.
    uint64_t startTime = __timestamp();

    // flatBlockIdx should be adjusted to 1D, 2D, and 3D launches to minimize
    // overhead. Reduce to uint32_t if launch index does not exceed 32-bit.
    uint64_t flatBlockIdx = (blockIdx.z * gridDim.x * gridDim.y)
        + (blockIdx.y * gridDim.x)
        + blockIdx.x;

    // Only one thread per block records the start word; reduce this predicate
    // based upon the dimensions of the block to minimize overhead.
    // (Original had a typo: "theradIdx.y".)
    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
    {
        // Put the smid in the 4 lower bits. If the multiprocessor count
        // exceeds 16 then widen to 5 bits. The lower ~5 bits of %globaltimer
        // carry no useful resolution, so overwriting them loses nothing.
        // If using clock64() and you want to improve precision, pack into
        // the most significant 4-5 bits instead.
        uint64_t smid = __smid();
        // Clear the low 4 bits of the timestamp before OR-ing in the SM id.
        // (Original used `startTime & 0xF`, which kept ONLY the junk low
        // bits and threw the timestamp away.)
        uint64_t data = (startTime & ~0xFULL) | (smid & 0xFULL);
        pBlockTime[flatBlockIdx * 2 + 0] = data;
    }

    // do work
    // Recommendation: turn the kernel you want to measure into a __device__
    // function and call it here — this makes kernels with multiple exit
    // points easier to instrument.

    // END TIMESTAMP
    // All threads in the block write the same value, which is not very
    // efficient. Depending on the kernel this can be reduced to 1 thread or
    // 1 thread per warp.
    uint64_t endTime = __timestamp();
    pBlockTime[flatBlockIdx * 2 + 1] = endTime;
}
非常感谢。我看到了,我不知道该放在哪里?它是一个设备函数,返回SM ID,任何线程都可以使用
get_smid()
调用。把它当作是一个被线程从GPU内核内部调用的C函数。是的,但是-调用它之后,我把值放在哪里?如果它在全局内存中-这对我试图分析的SM块分配有很大影响;printf()ing也是如此。我猜第一条注释中的“that”指的是函数定义。我认为这实际上取决于程序员根据应用程序请求的资源决定将返回值放在何处,以避免影响原始CUDA应用程序行为;如果这是她的意思,非常感谢你的帮助。我已经使用了提供的代码,它做了我想要的。