Parallel processing OpenCL中FFT输入序列大小的未知问题

Parallel processing OpenCL中FFT输入序列大小的未知问题,parallel-processing,opencl,gpu,fft,gpgpu,Parallel Processing,Opencl,Gpu,Fft,Gpgpu,各位,, 我最近尝试在OpenCL中实现快速傅立叶变换,并尝试了《OpenCL在行动》一书中提供的一些代码。以下代码旨在通过在位反转后执行4分量FFT来初始化输入序列(或离散信号): fft.cl: __kernel void fft_init( __global float2* g_data, __global float2* l_data, uint points_per_group, uint size, int dir ) { uint g_addr, l_addr, points_

各位好,我最近尝试在OpenCL中实现快速傅立叶变换,并试用了《OpenCL in Action》一书中提供的一些代码。以下代码旨在对输入序列(即离散信号)按位反转后的顺序读取,再执行4点FFT,从而完成初始化:

fft.cl:

/*
 * fft_init - first stage of the FFT: reads the input in bit-reversed order
 * and performs one 4-point FFT on every group of four points.
 *
 *   g_data            input sequence of complex samples (x = real, y = imaginary)
 *   l_data            output; receives the 4-point FFT results
 *   points_per_group  number of points processed by each work-group
 *   size              total number of points; must be a power of two >= 4
 *   dir               +1 for the forward transform, -1 for the inverse
 *
 * NOTE: l_data is indexed with the offset inside the work-group only (no
 * group term), so with more than one work-group the groups write to the
 * same l_data locations.
 */
__kernel void fft_init( __global float2* g_data, __global float2* l_data, uint points_per_group, uint size, int dir )
{
    uint g_addr, l_addr, points_per_item;
    uint mask_left, mask_right, shift_pos;
    uint4 index, br;

    float2 x1, x2, x3, x4;
    float2 sum12, diff12, sum34, diff34;

    points_per_item = points_per_group / get_local_size( 0 );
    l_addr = get_local_id( 0 ) * points_per_item;
    g_addr = get_group_id( 0 ) * points_per_group + l_addr;

    /* log2(size) - 1, computed exactly with integer arithmetic; a float
       log2() truncated to uint may come out one too small. */
    const uint top_shift = 30u - clz( size );

    for ( uint i = 0; i < points_per_item; i += 4 )
    {
        index = (uint4)( g_addr, g_addr + 1, g_addr + 2, g_addr + 3 );

        /* Bit reversal: exchange the outermost pair of address bits first ... */
        mask_left = size / 2;
        mask_right = 1;
        shift_pos = top_shift;

        br = ( index << shift_pos ) & mask_left;
        br |= ( index >> shift_pos ) & mask_right;

        /* ... then accumulate the inner pairs. The update comes before the
           use so that the final pass (shift 1 or 0) is not skipped and
           shift_pos never wraps below zero. */
        while ( shift_pos > 1 )
        {
            shift_pos -= 2;
            mask_left >>= 1;
            mask_right <<= 1;

            br |= ( index << shift_pos ) & mask_left;
            br |= ( index >> shift_pos ) & mask_right;
        }

        /* Load the four samples from their bit-reversed addresses. */
        x1 = g_data[ br.s0 ];
        x2 = g_data[ br.s1 ];
        x3 = g_data[ br.s2 ];
        x4 = g_data[ br.s3 ];

        /* 4-point FFT. diff34 is (x3 - x4) multiplied by -j (forward) or
           +j (inverse): (a + jb) * -j = b - ja. */
        sum12 = x1 + x2;
        diff12 = x1 - x2;
        sum34 = x3 + x4;
        diff34 = (float2)( x3.s1 - x4.s1, x4.s0 - x3.s0 ) * (float)dir;

        l_data[ l_addr ] = sum12 + sum34;
        l_data[ l_addr + 1 ] = diff12 + diff34;
        l_data[ l_addr + 2 ] = sum12 - sum34;
        l_data[ l_addr + 3 ] = diff12 - diff34;

        g_addr += 4;
        l_addr += 4;
    }
}

有人知道这个奇怪的问题是怎么回事吗?

查看主机程序可能会有所帮助。下面给出其中的一部分,特别是内核调用部分;主机程序是用Python和pyopencl编写的。
def setInputs(self):
    """Create the host arrays and the OpenCL buffers used by the kernel.

    Sets ``self.numPoints`` to 8 and stores two float32 values per point:
    ``self.pointsArray`` is filled with 1.0 and ``self.resultArray`` with
    -1.0. Each array is then copied into a read/write device buffer
    (``self.pointsBuffer`` / ``self.resultBuffer``) created on
    ``self.context``.
    """
    count = 8
    self.numPoints = count

    # Two consecutive floats per point; kept as a tuple on the instance.
    self.points = (1.0, 1.0) * count
    self.pointsArray = numpy.array(self.points, dtype=numpy.float32)

    # -1.0 placeholder values that the kernel output will replace.
    placeholder = [-1.0] * (2 * count)
    self.resultArray = numpy.array(placeholder, dtype=numpy.float32)

    # Both buffers are initialised from their host array at creation time.
    self.pointsBuffer = cl.Buffer(
        self.context,
        cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=self.pointsArray,
    )
    self.resultBuffer = cl.Buffer(
        self.context,
        cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=self.resultArray,
    )


def runKernel( self ):
    """Run the ``fft_init`` kernel once and print its timing and output.

    The kernel is launched with a single work-item in a single work-group,
    so that one item processes every point. After the kernel finishes, the
    result buffer is copied back into ``self.resultArray`` and printed.

    Reads ``self.numPoints``, ``self.pointsArray``, ``self.pointsBuffer``,
    ``self.resultBuffer`` and ``self.resultArray``, which ``setInputs``
    creates, so ``setInputs`` must have been called first.

    NOTE(review): ``event.profile`` presumably requires the command queue to
    have been created with profiling enabled - confirm where the queue is
    built.
    """

    globalWorkSize = ( 1, )

    localWorkSize = ( 1, )

    # A single work-group covers the whole sequence, so points_per_group and
    # size are both the number of points the buffers were sized for.
    numPoints = numpy.uint32( self.numPoints )

    # %-formatting gives the same text as the Python 2 print statement
    # (items separated by one space) and also runs on Python 3.
    print( 'Input points :  %s' % ( self.pointsArray, ) )

    event = self.program.fft_init( self.commandQueues[ 0 ],
                                   globalWorkSize,
                                   localWorkSize,
                                   self.pointsBuffer,
                                   self.resultBuffer,
                                   numPoints,
                                   numPoints,
                                   numpy.int32( 1 )
                                   )

    event.wait()

    # The difference of the profiling timestamps is scaled by 1e-9 to seconds.
    elapsed = ( event.profile.end - event.profile.start ) * 1e-9

    print( 'Time Consumption :  %s  seconds' % ( elapsed, ) )

    cl.enqueue_copy( self.commandQueues[ 0 ],
                     self.resultArray,
                     self.resultBuffer
                     ).wait()

    print( 'Result :  %s' % ( self.resultArray, ) )