Opencl Stockham FFT的更高基数(或更好)配方
背景 我已经使用OpenCL实现了微软研究院的基2 FFT(Stockham自动排序)算法 我在内核中使用浮点纹理(256列X N行)进行输入和输出,因为我需要在非整数点进行采样,我认为最好将其委托给纹理采样硬件。请注意,我的FFT始终是256点序列(纹理中的每一行)。此时,我的N是16384或32768,具体取决于我使用的GPU和允许的最大2D纹理大小 我还需要一次执行4个实值序列的FFT,因此内核执行FFT(a,b,c,d)作为FFT(a+ib,c+id),我可以在以后使用O(n)算法从中提取4个复序列。如果有人愿意,我可以详细说明这一点,但我认为这不属于这个问题的范围 内核源代码Opencl Stockham FFT的更高基数(或更好)配方,opencl,fft,Opencl,Fft,背景 我已经使用OpenCL实现了微软研究院的基2 FFT(Stockham自动排序)算法 我在内核中使用浮点纹理(256列X N行)进行输入和输出,因为我需要在非整数点进行采样,我认为最好将其委托给纹理采样硬件。请注意,我的FFT始终是256点序列(纹理中的每一行)。此时,我的N是16384或32768,具体取决于我使用的GPU和允许的最大2D纹理大小 我还需要一次执行4个实值序列的FFT,因此内核执行FFT(a,b,c,d)作为FFT(a+ib,c+id),我可以在以后使用O(n)算法从中提
// Sampler used for all FFT texture reads: unnormalized (pixel) coordinates,
// clamp-to-edge addressing and nearest-texel filtering.
const sampler_t fftSampler = CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_FALSE;

// One radix-2 Stockham pass over every row of `input`, written to `output`.
//
// Each work-item (x, y) produces one output texel. A float4 texel carries two
// complex values, (x, y) and (z, w); both are processed with the same twiddle.
//
//   input   - source texture for this pass
//   output  - destination texture for this pass
//   fftSize - divisor used for the block index and the twiddle angle
//             (presumably the butterfly size of the current pass - confirm
//             against the host loop)
//   size    - the second operand is read size/2 texels after the first
__kernel void FFT_Stockham(read_only image2d_t input, write_only image2d_t output, int fftSize, int size)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int halfFft = fftSize / 2;
    const float fftSizeF = convert_float(fftSize);

    // Locate the two input texels combined by this butterfly.
    const int blockBase = floor(col / fftSizeF) * halfFft;
    const int srcLo = blockBase + (col % halfFft);
    const int srcHi = srcLo + (size / 2);

    const float4 lower = read_imagef(input, fftSampler, (int2)(srcLo, row));
    const float4 upper = read_imagef(input, fftSampler, (int2)(srcHi, row));

    // Twiddle factor w = cos(angle) + i*sin(angle), angle = -2*pi*col/fftSize.
    // TODO: Convert the two calculations below into lookups from a __constant buffer
    const float angle = -6.283185f * (convert_float(col) / fftSizeF);
    const float wRe = native_cos(angle);
    const float wIm = native_sin(angle);

    // lower + w * upper for both packed complex values:
    //   re = lower.re + wRe*upper.re - wIm*upper.im
    //   im = lower.im + wIm*upper.re + wRe*upper.im
    const float4 reTerm = (float4)(wRe, wIm, wRe, wIm);
    const float4 imTerm = (float4)(-wIm, wRe, -wIm, wRe);
    const float4 combined = lower + reTerm * upper.xxzz + imTerm * upper.yyww;

    write_imagef(output, (int2)(col, row), combined);
}
宿主代码仅调用此内核log2(256)次,对输入和输出纹理进行乒乓处理
注意:我试着删除native_cos和native_sin,看看这是否影响了时间,但似乎没有太大的改变。无论如何,这不是我要找的因素
访问模式
知道我可能是内存带宽受限,下面是我的基数2 FFT的内存访问模式(每行)
- X0 - 要合并的元素1(读取)
- X1 - 要合并的元素2(读取)
- X - 要写入的元素(写入)
提前感谢您阅读以上全部内容并尝试提供帮助。 我尝试了几个版本,但对于我的具体情况,在CPU和GPU上性能最好的是radix-16内核。 下面给出该内核以供参考。它取自埃里克·班维尔(Eric Bainville)的(非常出色的)作品,使用时已完整注明出处。
// #define M_PI 3.14159265358979f
// Multiplies each float4 of x by (Scale, -Scale, Scale, -Scale): the even
// components are scaled and the odd components are scaled and negated, i.e.
// the two packed complex values (re, im, re, im) are conjugated and scaled.
// Global size is x.Length/2. Scale = 1 for the direct transform, 1/N for the
// inverse transform (iFFT).
__kernel void ConjugateAndScale(__global float4* x, const float Scale)
{
    const int idx = get_global_id(0);
    x[idx] *= (float4)(Scale, -Scale, Scale, -Scale);
}
// Multiply the complex number a = (re, im) by EXP(-I*PI/2) = -I.
// (re + I*im) * (-I) = im - I*re, so the result is (im, -re).
float2 mul_p1q2(float2 a)
{
    float2 rotated;
    rotated.x = a.y;
    rotated.y = -a.x;
    return rotated;
}
// Square of the complex number a = (re, im):
// (re + I*im)^2 = (re^2 - im^2) + I*(2*re*im).
float2 sqr_1(float2 a)
{
    const float re = a.x*a.x - a.y*a.y;
    const float im = 2.0f*a.x*a.y;
    return (float2)(re, im);
}
// Two 2-point DFTs over the four complex numbers packed in A.
// With A = (a, b, c, d) the result is (a+c, b+d, a-c, b-d), i.e.
// (a', c') = DFT2(a, c) and (b', d') = DFT2(b, d).
float8 dft2_4(float8 a)
{
    const float4 firstPair = a.lo;   // (a, b)
    const float4 secondPair = a.hi;  // (c, d)
    return (float8)(firstPair + secondPair, firstPair - secondPair);
}
// 4-point DFT of the four complex numbers packed in A, built from two rounds
// of paired 2-point DFTs with a -I twiddle in between.
float8 dft4_4(float8 a)
{
    // First round: 2x DFT2.
    const float8 stage1 = dft2_4(a);

    // Reorder the intermediate values and rotate the last one by -I.
    float8 shuffled;
    shuffled.s01 = stage1.lo.lo;
    shuffled.s23 = stage1.hi.lo;
    shuffled.s45 = stage1.lo.hi;
    shuffled.s67 = mul_p1q2(stage1.hi.hi);

    // Second round: 2x DFT2.
    return dft2_4(shuffled);
}
// Complex product on vectors of interleaved complex numbers: the even
// components hold the real parts and the odd components the imaginary parts.
// MUL_RE yields the real parts of a*b, MUL_IM the imaginary parts.
// Arguments are parenthesized so that expression arguments (e.g. p+q) select
// .even/.odd on the whole expression rather than on its last operand.
#define MUL_RE(a,b) ((a).even*(b).even - (a).odd*(b).odd)
#define MUL_IM(a,b) ((a).even*(b).odd + (a).odd*(b).even)
// Product of two complex numbers a = (re, im) and b = (re, im).
float2 mul_1(float2 a, float2 b)
{
    const float re = a.x*b.x - a.y*b.y;
    const float im = a.x*b.y + a.y*b.x;
    return (float2)(re, im);
}
// Element-wise complex product of two float4 vectors, each holding two
// complex numbers as (re0, im0, re1, im1).
float4 mul_1_F4(float4 a, float4 b)
{
    float4 product;
    product.xz = a.xz*b.xz - a.yw*b.yw;  // real parts
    product.yw = a.xz*b.yw + a.yw*b.xz;  // imaginary parts
    return product;
}
// Element-wise complex product of two pairs of complex numbers packed as
// (re0, im0, re1, im1). Same computation as mul_1_F4, so it is reused here.
float4 mul_2(float4 a, float4 b)
{
    return mul_1_F4(a, b);
}
// 2-point DFT of the two complex numbers packed in A = (u, v):
// returns (u+v, u-v).
float4 dft2_2(float4 a)
{
    const float2 u = a.lo;
    const float2 v = a.hi;
    return (float4)(u + v, u - v);
}
// Returns the unit complex number cos(alpha) + I*sin(alpha) as (re, im).
// This uses the full-precision cos()/sin() built-ins; the original author
// also listed sincos() and native_cos()/native_sin() as alternative variants.
float2 exp_alpha_1(float alpha)
{
    return (float2)(cos(alpha), sin(alpha));
}
// Returns cos(alpha) + I*sin(alpha) duplicated into both complex slots of a
// float4: (cos, sin, cos, sin). Built on exp_alpha_1, which evaluates the
// same cos()/sin() pair.
float4 exp_alpha_1_F4(float alpha)
{
    const float2 unit = exp_alpha_1(alpha);
    return (float4)(unit, unit);
}
// mul_pPqQ(a) returns a*EXP(-I*PI*P/Q) for a complex number a = (re, im).
// Rotations that coincide with one already defined are aliased with #define.

// P = 0: multiplication by 1.
#define mul_p0q1(a) (a)
#define mul_p0q2 mul_p0q1
// mul_p1q2 (multiplication by -I) is defined earlier in this file.

// Single-precision literal: the kernels work in float, so avoid a double
// constant that needs double support or an implicit narrowing conversion.
__constant float SQRT_1_2 = 0.707106781186548f; // cos(Pi/4)
#define mul_p0q4 mul_p0q2
// a*EXP(-I*PI/4) = sqrt(1/2) * ((re + im) + I*(im - re))
float2 mul_p1q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y); }
#define mul_p2q4 mul_p1q2
// a*EXP(-I*3*PI/4) = sqrt(1/2) * ((im - re) - I*(re + im))
float2 mul_p3q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y); }
// Rotations by multiples of PI/8: mul_pPq8(a) returns a*EXP(-I*PI*P/8).
// Even values of P reduce to the q4 family and are aliased; odd values
// multiply by the explicit constant (cos(P*PI/8), -sin(P*PI/8)).
// The literals carry the f suffix so the constants stay single precision.
__constant float COS_8 = 0.923879532511287f; // cos(Pi/8)
__constant float SIN_8 = 0.382683432365089f; // sin(Pi/8)
#define mul_p0q8 mul_p0q4
float2 mul_p1q8(float2 a) { return mul_1((float2)(COS_8,-SIN_8),a); }
#define mul_p2q8 mul_p1q4
// cos(3*Pi/8) = sin(Pi/8), sin(3*Pi/8) = cos(Pi/8)
float2 mul_p3q8(float2 a) { return mul_1((float2)(SIN_8,-COS_8),a); }
#define mul_p4q8 mul_p2q4
// cos(5*Pi/8) = -sin(Pi/8), sin(5*Pi/8) = cos(Pi/8)
float2 mul_p5q8(float2 a) { return mul_1((float2)(-SIN_8,-COS_8),a); }
#define mul_p6q8 mul_p3q4
// cos(7*Pi/8) = -cos(Pi/8), sin(7*Pi/8) = sin(Pi/8)
float2 mul_p7q8(float2 a) { return mul_1((float2)(-COS_8,-SIN_8),a); }
// In-place 2-point DFT followed by a twiddle on the difference:
//   a <- a + b
//   b <- t(a - b)   (computed from the original a and b)
// a and b must be float2 lvalues; t is the name of a function or macro taking
// one float2 (one of the mul_pPqQ rotations).
// Wrapped in do { } while (0) so that `DFT2_TWIDDLE(...);` is a single
// statement in any context, and arguments are parenthesized.
#define DFT2_TWIDDLE(a,b,t) do { float2 tmp = t((a)-(b)); (a) += (b); (b) = tmp; } while (0)
// One radix-16 pass of the FFT.
//
// T = N/16 = number of threads (the global work size).
// P is the length of input sub-sequences, 1,16,256,...,N/16.
//
//   x  - input buffer; work-item I reads x[I + m*T] for m = 0..15
//   y  - output buffer; work-item I writes y[J + m*P] for m = 0..15
//   pp - P for this pass; used as a bit mask (P-1), so it is assumed to be a
//        power of two
//
// Every float4 element holds two complex numbers (.lo and .hi); the two are
// transformed independently with the same twiddle factors.
__kernel void FFT_Radix16(__global const float4 * x, __global float4 * y, int pp)
{
int p = pp;
int t = get_global_size(0); // number of threads
int i = get_global_id(0); // current thread
int k = i & (p-1); // index in input sequence, in 0..P-1
// Inputs indices are I+{0,..,15}*T
x += i;
// Output indices are J+{0,..,15}*P, where
// J is I with four 0 bits inserted at bit log2(P)
y += ((i-k)<<4) + k;
// Load
float4 u[16];
for (int m=0;m<16;m++) u[m] = x[m*t];
// Twiddle, twiddling factors are exp(-I*PI*{0,..,15}*K/8P)
// M_PI_F is the single-precision constant that OpenCL C always provides;
// M_PI is only available when the device supports doubles and would promote
// this expression to double precision.
float alpha = -M_PI_F*(float)k/(float)(8*p);
for (int m=1;m<16;m++) u[m] = mul_1_F4(exp_alpha_1_F4(m * alpha), u[m]);
// 8x in-place DFT2 and twiddle (1)
DFT2_TWIDDLE(u[0].lo,u[8].lo,mul_p0q8);
DFT2_TWIDDLE(u[0].hi,u[8].hi,mul_p0q8);
DFT2_TWIDDLE(u[1].lo,u[9].lo,mul_p1q8);
DFT2_TWIDDLE(u[1].hi,u[9].hi,mul_p1q8);
DFT2_TWIDDLE(u[2].lo,u[10].lo,mul_p2q8);
DFT2_TWIDDLE(u[2].hi,u[10].hi,mul_p2q8);
DFT2_TWIDDLE(u[3].lo,u[11].lo,mul_p3q8);
DFT2_TWIDDLE(u[3].hi,u[11].hi,mul_p3q8);
DFT2_TWIDDLE(u[4].lo,u[12].lo,mul_p4q8);
DFT2_TWIDDLE(u[4].hi,u[12].hi,mul_p4q8);
DFT2_TWIDDLE(u[5].lo,u[13].lo,mul_p5q8);
DFT2_TWIDDLE(u[5].hi,u[13].hi,mul_p5q8);
DFT2_TWIDDLE(u[6].lo,u[14].lo,mul_p6q8);
DFT2_TWIDDLE(u[6].hi,u[14].hi,mul_p6q8);
DFT2_TWIDDLE(u[7].lo,u[15].lo,mul_p7q8);
DFT2_TWIDDLE(u[7].hi,u[15].hi,mul_p7q8);
// 8x in-place DFT2 and twiddle (2)
DFT2_TWIDDLE(u[0].lo,u[4].lo,mul_p0q4);
DFT2_TWIDDLE(u[0].hi,u[4].hi,mul_p0q4);
DFT2_TWIDDLE(u[1].lo,u[5].lo,mul_p1q4);
DFT2_TWIDDLE(u[1].hi,u[5].hi,mul_p1q4);
DFT2_TWIDDLE(u[2].lo,u[6].lo,mul_p2q4);
DFT2_TWIDDLE(u[2].hi,u[6].hi,mul_p2q4);
DFT2_TWIDDLE(u[3].lo,u[7].lo,mul_p3q4);
DFT2_TWIDDLE(u[3].hi,u[7].hi,mul_p3q4);
DFT2_TWIDDLE(u[8].lo,u[12].lo,mul_p0q4);
DFT2_TWIDDLE(u[8].hi,u[12].hi,mul_p0q4);
DFT2_TWIDDLE(u[9].lo,u[13].lo,mul_p1q4);
DFT2_TWIDDLE(u[9].hi,u[13].hi,mul_p1q4);
DFT2_TWIDDLE(u[10].lo,u[14].lo,mul_p2q4);
DFT2_TWIDDLE(u[10].hi,u[14].hi,mul_p2q4);
DFT2_TWIDDLE(u[11].lo,u[15].lo,mul_p3q4);
DFT2_TWIDDLE(u[11].hi,u[15].hi,mul_p3q4);
// 8x in-place DFT2 and twiddle (3)
DFT2_TWIDDLE(u[0].lo,u[2].lo,mul_p0q2);
DFT2_TWIDDLE(u[0].hi,u[2].hi,mul_p0q2);
DFT2_TWIDDLE(u[1].lo,u[3].lo,mul_p1q2);
DFT2_TWIDDLE(u[1].hi,u[3].hi,mul_p1q2);
DFT2_TWIDDLE(u[4].lo,u[6].lo,mul_p0q2);
DFT2_TWIDDLE(u[4].hi,u[6].hi,mul_p0q2);
DFT2_TWIDDLE(u[5].lo,u[7].lo,mul_p1q2);
DFT2_TWIDDLE(u[5].hi,u[7].hi,mul_p1q2);
DFT2_TWIDDLE(u[8].lo,u[10].lo,mul_p0q2);
DFT2_TWIDDLE(u[8].hi,u[10].hi,mul_p0q2);
DFT2_TWIDDLE(u[9].lo,u[11].lo,mul_p1q2);
DFT2_TWIDDLE(u[9].hi,u[11].hi,mul_p1q2);
DFT2_TWIDDLE(u[12].lo,u[14].lo,mul_p0q2);
DFT2_TWIDDLE(u[12].hi,u[14].hi,mul_p0q2);
DFT2_TWIDDLE(u[13].lo,u[15].lo,mul_p1q2);
DFT2_TWIDDLE(u[13].hi,u[15].hi,mul_p1q2);
// 8x DFT2 and store (reverse binary permutation)
y[0] = u[0] + u[1];
y[p] = u[8] + u[9];
y[2*p] = u[4] + u[5];
y[3*p] = u[12] + u[13];
y[4*p] = u[2] + u[3];
y[5*p] = u[10] + u[11];
y[6*p] = u[6] + u[7];
y[7*p] = u[14] + u[15];
y[8*p] = u[0] - u[1];
y[9*p] = u[8] - u[9];
y[10*p] = u[4] - u[5];
y[11*p] = u[12] - u[13];
y[12*p] = u[2] - u[3];
y[13*p] = u[10] - u[11];
y[14*p] = u[6] - u[7];
y[15*p] = u[14] - u[15];
}
/#定义M#u PI 3.14159265358979f
//全局大小为x.Length/2,比例=1表示直接,1/N表示反向(iFFT)
__内核无效共轭数字刻度(uu全局浮点4*x,常量浮点刻度)
{
int i=获取全局id(0);
浮子温度=刻度;
浮动4 t=(浮动4)(温度-温度,温度-温度);
x[i]*=t;
}
//返回a*EXP(-I*PI*1/2)=a*(-I)
float2 mul_p1q2(float2 a){return(float2)(a.y,-a.x);}
//返回^2
浮动2 sqr_1(浮动2 a)
{return(float2)(a.x*a.x-a.y*a.y,2.0f*a.x*a.y);}
//返回A中四个复数的2x DFT2
//如果A=(A,b,c,d),则返回(A',b',c',d'),其中(A',c')=DFT2(A,c)
//和(b',d')=DFT2(b,d)。
float8dft2_4(float8a){return(float8)(a.lo+a.hi,a.lo-a.hi);}
//返回A中4个复数的DFT
浮动8 dft4_4(浮动8 a)
{
//2x DFT2
float8x=dft2_4(a);
//洗牌、旋转和2倍DFT2
返回dft2_4((浮动8)(x.lo.lo,x.hi.lo,x.lo.hi,mul_p1q2(x.hi.hi));
}
//复数乘积,复数向量的乘积
#定义MUL_RE(a,b)(a.偶数*b.偶数-a.奇数*b.奇数)
#定义多个IM(a,b)(a.偶数*b.奇数+a.奇数*b.偶数)
浮点数2 mul_1(浮点数2 a、浮点数2 b)
{float2x;x.偶=MUL_RE(a,b);x.奇=MUL_IM(a,b);返回x;}
浮动4多个浮动1个浮动F4(浮动4 a,浮动4 b)
{float4x;x.偶=MUL_RE(a,b);x.奇=MUL_IM(a,b);返回x;}
浮点数4 mul_2(浮点数4 a、浮点数4 b)
{float4x;x.偶=MUL_RE(a,b);x.奇=MUL_IM(a,b);返回x;}
//返回向量A中两个复数的DFT2
float4dft2_2(float4a){return(float4)(a.lo+a.hi,a.lo-a.hi);}
//返回cos(alpha)+I*sin(alpha)(3种变体)
浮动2 exp_alpha_1(浮动alpha)
{
浮子cs,sn;
//sn=sincos(alpha,&cs);//sincos
//cs=native_cos(alpha);sn=native_sin(alpha);//native sin+cos
cs=cos(alpha);sn=sin(alpha);//sin+cos
返回(浮动2)(cs,序列号);
}
//返回cos(alpha)+I*sin(alpha)(3种变体)
浮动4 exp_alpha_1_F4(浮动alpha)
{
浮子cs,sn;
//sn=sincos(alpha,&cs);//sincos
//cs=native_cos(alpha);sn=native_sin(alpha);//native sin+cos
cs=cos(alpha);sn=sin(alpha);//sin+cos
返回(浮动4)(cs、序列号、cs、序列号);
}
//mul_p*q*(a)返回一个*EXP(-I*PI*p/q)
#定义mul_p0q1(a)(a)
#定义多个p0q2多个p0q1
//float2 mul_p1q2(float2 a){return(float2)(a.y,-a.x);}
__常量浮点SQRT_1_2=0.707106781186548;//cos(Pi/4)
#定义多个p0q4多个p0q2
float2 mul_p1q4(float2 a){返回(float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y);}
#定义多个p2q4多个p1q2
float2 mul_p3q4(float2 a){return(float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y)}
__恒定浮动COS_8=0.9238795352511287;//cos(Pi/8)
__恒定浮动SIN_8=0.382683432365089;//sin(Pi/8)
#定义多个p0q8多个p0q4
float2 mul_p1q8(float2 a){返回mul_1((float2)(COS_8,-SIN_8),a)}
#定义多个p2q8多个p1q4
float2 mul_p3q8(float2 a){返回mul_1((float2)(SIN_8,-COS_8),a)}
#定义多个p4q8多个p2q4
float2 mul_p5q8(float2 a){返回mul_1((float2)(-SIN_8,-COS_8),a)}
#定义多个p6q8多个p3q4
float2 mul_p7q8(float2 a){返回mul_1((float2)(-COS_8,-SIN_8),a)}
//就地计算DFT2并旋转
#定义DFT2_旋转(a,b,t){float2tmp=t(a-b);a+=b;b=tmp;}
//T=N/16=螺纹数。
//P是输入子序列的长度,1,16256,…,N/16。
__内核无效FFT半径16(uuu全局常量浮点4*x,uuu全局浮点4*y,int-pp)
{
int p=pp;
int t=get_global_size(0);//线程数
int i=
// Host-side driver: enqueues fftKernel repeatedly, multiplying fftSize by 16
// each pass and swapping the two buffers between passes (ping-pong).
// ev holds the event produced by the pass being enqueued; pEv holds the event
// of the previous pass, which the next pass waits on.
var ev = new[] { new Cl.Event() };
var pEv = new[] { new Cl.Event() };
// Value passed as kernel argument 2; starts at 1 and is shifted left by 4
// after every pass.
int fftSize = 1;
int iter = 0;
// Loop bound: passes run while fftSize <= distributionSize >> 5.
// NOTE(review): the reason for the shift by 5 is not visible here - confirm
// against how distributionSize relates to the buffer length.
int n = distributionSize >> 5;
while (fftSize <= n)
{
// Argument 0 = memA, argument 1 = memB, argument 2 = current fftSize
// (presumably input, output and pass length of the kernel - confirm).
Cl.SetKernelArg(fftKernel, 0, memA);
Cl.SetKernelArg(fftKernel, 1, memB);
Cl.SetKernelArg(fftKernel, 2, fftSize);
// 1-dimensional launch. The first pass has an empty wait list; every later
// pass waits on the previous pass's event. The new event is stored in ev[0].
Cl.EnqueueNDRangeKernel(commandQueue, fftKernel, 1, null, globalWorkgroupSize, localWorkgroupSize,
(uint)(iter == 0 ? 0 : 1),
iter == 0 ? null : pEv,
out ev[0]).Check();
// The previous pass's event has been handed to the enqueue call above and is
// released here.
if (iter > 0)
pEv[0].Dispose();
// After this swap pEv refers to the event just produced.
Swap(ref ev, ref pEv);
// Ping-pong: the buffer just passed as argument 1 becomes argument 0 of the
// next pass.
Swap(ref memA, ref memB); // ping-pong
fftSize = fftSize << 4;
iter++;
// Blocks until the queued work has completed before the next pass is set up.
Cl.Finish(commandQueue);
}
// Undo the swap done at the end of the final pass, so memB again refers to
// the buffer that was passed as argument 1 in that pass.
// NOTE(review): the event left in pEv[0] after the last pass is not disposed
// in this fragment - confirm it is released later.
Swap(ref memA, ref memB);