Здесь приведен алгоритм БПФ для произвольного N, вычисляемый через свертку. Данный алгоритм может быть еще значительно ускорен двумя путями.
В начале функции проверяется, что
Далее, возможно усовершенствование: не надеяться на компилятор, который должен (при удачном раскладе) оптимизировать вычисления в главном цикле fft_step, а сделать оптимизацию явно. В конце этой страницы приведен оптимизированный вариант функции fft_step. Я его привел отдельно, так как он гораздо менее наглядный. Он рассчитывает на то, что серия структур ShortComplex в памяти представляет собой последовательность double, double, double,... где чередуются реальные и мнимые части.
/* Fast Fourier Transformation ==================================================== Coded by Miroslav Voinarovsky, 2002 This source is freeware. */ #ifndef FFT_H_ #define FFT_H_ struct Complex; struct ShortComplex; /* Fast Fourier Transformation x: x - array of items N: N - number of items in array complement: false - normal (direct) transformation, true - reverse transformation */ extern void universal_fft(ShortComplex *x, int N, bool complement); struct ShortComplex { double re, im; inline void operator=(const Complex &y); }; struct Complex { long double re, im; inline void operator= (const Complex &y); inline void operator= (const ShortComplex &y); }; inline void ShortComplex::operator=(const Complex &y) { re = (double)y.re; im = (double)y.im; } inline void Complex::operator= (const Complex &y) { re = y.re; im = y.im; } inline void Complex::operator= (const ShortComplex &y) { re = y.re; im = y.im; } #endif
/* Fast Fourier Transformation ==================================================== Coded by Miroslav Voinarovsky, 2006 This source is freeware. */ #include "fft.h" #include <math.h> // This array contains values from 0 to 255 with reverse bit order static unsigned char reverse256[]= { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF, }; //This is minimized version of type 'complex'. All operations is inline static long double temp; inline void operator+=(ShortComplex &x, const Complex &y) { x.re += (double)y.re; x.im += (double)y.im; } inline void operator-=(ShortComplex &x, const Complex &y) { x.re -= (double)y.re; x.im -= (double)y.im; } inline void operator*=(Complex &x, const Complex &y) { temp = x.re; x.re = temp * y.re - x.im * y.im; x.im = temp * y.im + x.im * y.re; } inline void operator*=(Complex &x, const ShortComplex &y) { temp = x.re; x.re = temp * y.re - x.im * y.im; x.im = temp * y.im + x.im * y.re; } inline void operator/=(ShortComplex &x, double div) { x.re /= div; x.im /= div; } inline void operator/=(Complex &x, double div) { x.re /= div; x.im /= div; } inline void operator*=(ShortComplex&x, const ShortComplex &y) { double temp = x.re; x.re = temp * y.re - x.im * y.im; x.im = temp * y.im + x.im * y.re; } //This is array exp(-2*pi*j/2^n) for n= 1,...,32 //exp(-2*pi*j/2^n) = Complex( cos(2*pi/2^n), -sin(2*pi/2^n) ) static Complex W2n[32]={ {-1.00000000000000000000000000000000, 0.00000000000000000000000000000000}, // W2 calculator (copy/paste) : po, ps { 0.00000000000000000000000000000000, -1.00000000000000000000000000000000}, // W4: p/2=o, p/2=s { 0.70710678118654752440084436210485, -0.70710678118654752440084436210485}, // W8: p/4=o, p/4=s { 0.92387953251128675612818318939679, -0.38268343236508977172845998403040}, // p/8=o, p/8=s { 0.98078528040323044912618223613424, -0.19509032201612826784828486847702}, // p/16= { 0.99518472667219688624483695310948, -9.80171403295606019941955638886e-2}, // p/32= { 0.99879545620517239271477160475910, -4.90676743274180142549549769426e-2}, // p/64= { 0.99969881869620422011576564966617, -2.45412285229122880317345294592e-2}, // p/128= { 0.99992470183914454092164649119638, -1.22715382857199260794082619510e-2}, // p/256= { 0.99998117528260114265699043772857, -6.13588464915447535964023459037e-3}, // p/(2y9)= { 0.99999529380957617151158012570012, -3.06795676296597627014536549091e-3}, // p/(2y10)= { 0.99999882345170190992902571017153, -1.53398018628476561230369715026e-3}, // p/(2y11)= { 0.99999970586288221916022821773877, -7.66990318742704526938568357948e-4}, // p/(2y12)= { 0.99999992646571785114473148070739, -3.83495187571395589072461681181e-4}, // p/(2y13)= { 0.99999998161642929380834691540291, -1.91747597310703307439909561989e-4}, // p/(2y14)= { 0.99999999540410731289097193313961, -9.58737990959773458705172109764e-5}, // p/(2y15)= { 0.99999999885102682756267330779455, -4.79368996030668845490039904946e-5}, // p/(2y16)= { 0.99999999971275670684941397221864, -2.39684498084182187291865771650e-5}, // p/(2y17)= { 0.99999999992818917670977509588385, -1.19842249050697064215215615969e-5}, // p/(2y18)= { 0.99999999998204729417728262414778, -5.99211245264242784287971180889e-6}, // p/(2y19)= { 0.99999999999551182354431058417300, -2.99605622633466075045481280835e-6}, // p/(2y20)= { 0.99999999999887795588607701655175, -1.49802811316901122885427884615e-6}, // p/(2y21)= { 0.99999999999971948897151921479472, -7.49014056584715721130498566730e-7}, // p/(2y22)= { 0.99999999999992987224287980123973, -3.74507028292384123903169179084e-7}, // p/(2y23)= { 0.99999999999998246806071995015625, -1.87253514146195344868824576593e-7}, // p/(2y24)= { 0.99999999999999561701517998752946, -9.36267570730980827990672866808e-8}, // p/(2y25)= { 0.99999999999999890425379499688176, -4.68133785365490926951155181385e-8}, // p/(2y26)= { 0.99999999999999972606344874922040, -2.34066892682745527595054934190e-8}, // p/(2y27)= { 0.99999999999999993151586218730510, -1.17033446341372771812462135032e-8}, // p/(2y28)= { 0.99999999999999998287896554682627, -5.85167231706863869080979010083e-9}, // p/(2y29)= { 0.99999999999999999571974138670657, -2.92583615853431935792823046906e-9}, // p/(2y30)= { 0.99999999999999999892993534667664, -1.46291807926715968052953216186e-9}, // p/(2y31)= }; #define M_PI (3.1415926535897932384626433832795) inline void complex_mul(ShortComplex *z, const ShortComplex *z1, const Complex *z2) { z->re = (double)(z1->re * z2->re - z1->im * z2->im); z->im = (double)(z1->re * z2->im + z1->im * z2->re); } static ShortComplex *createWstore(unsigned int Nmax) { unsigned int N, Skew, Skew2; ShortComplex *Wstore, *Warray, *WstoreEnd; Complex WN, *pWN; Skew2 = Nmax >> 1; Wstore = new ShortComplex[Skew2]; WstoreEnd = Wstore + Skew2; Wstore[0].re = 1.0; Wstore[0].im = 0.0; for(N = 4, pWN = W2n + 1, Skew = Skew2 >> 1; N <= Nmax; N += N, pWN++, Skew2 = Skew, Skew >>= 1) { //WN = W(1, N) = exp(-2*pi*j/N) WN= *pWN; for(Warray = Wstore; Warray < WstoreEnd; Warray += Skew2) complex_mul(Warray + Skew, Warray, &WN); } return Wstore; } static void fft_step(ShortComplex *x, unsigned int T, bool complement, const ShortComplex *Wstore) { unsigned int Nmax, I, J, N, Nd2, k, m, Skew, mpNd2, Step; unsigned char *Ic = (unsigned char*) &I; unsigned char *Jc = (unsigned char*) &J; ShortComplex S; const ShortComplex *Warray; Complex Temp; Nmax = 1 << T; //first interchanging for(I = 1; I < Nmax - 1; I++) { Jc[0] = reverse256[Ic[3]]; Jc[1] = reverse256[Ic[2]]; Jc[2] = reverse256[Ic[1]]; Jc[3] = reverse256[Ic[0]]; J >>= (32 - T); if (I < J) { S = x[I]; x[I] = x[J]; x[J] = S; } } //main loop for(N = 2, Nd2 = 1, Skew = Nmax >> 1, Step= 1; N <= Nmax; Nd2 = N, N += N, Skew >>= 1, Step++) { for(Warray = Wstore, k = 0; k < Nd2; k++, Warray += Skew) { for(m = k; m < Nmax; m += N) { Temp = *Warray; if (complement) Temp.im= -Temp.im; mpNd2= m + Nd2; Temp *= x[mpNd2]; x[mpNd2] = x[m]; x[mpNd2] -= Temp; x[m] += Temp; } } } } /* x: x - array of items N: N - number of items in array complement: false - normal (direct) transformation, true - reverse transformation */ void universal_fft(ShortComplex *x, int N, bool complement) { ShortComplex *x_; ShortComplex *w; ShortComplex *Wstore; int T; T= (int)floor(log((double)N) / log(2.0) + 0.5); if (1 << T == N) { Wstore= createWstore(N); fft_step(x, T, complement, Wstore); if (complement) { for(int i= 0; i < N; i++) x[i]/= N; } delete [] Wstore; return; } //find N', T int N2= N+N; int N_; long double arg; for(N_= 1, T= 0; N_ < N2; N_+= N_, T++) { } //find --2pi/N/2 = pi/N long double piN= M_PI / N; if (complement) piN= -piN; //find x_[n] = x[n]*e^--2*j*pi*n*n/N/2 = x[n]*e^j*piN*n*n x_= new ShortComplex[N_]; Complex v; int n; for(n= 0; n < N; ++n) { arg= piN*n*n; v.re= cosl(arg); v.im= sinl(arg); complex_mul(x_ + n, x + n, &v); } for(; n < N_; ++n) x_[n].re= x_[n].im= 0; //find w[n] = e^-j*2*pi*(2*N-2-n)^2/N/2= e^-j*piN*(2*N-2-n)^2 w= new ShortComplex[N_]; int N22= 2*N - 2; for(n= 0; n < N_; ++n, --N22) { arg= -piN*N22*N22; w[n].re= (double)cos(arg); w[n].im= (double)sin(arg); } //FFT1 Wstore= createWstore(N_); fft_step(x_, T, false, Wstore); //FFT2 fft_step(w, T, false, Wstore); //svertka for(n= 0; n < N_; ++n) x_[n]*= w[n]; //FFT3 (complement) fft_step(x_, T, true, Wstore); //find X[n] = X_[n]*e^--j*2*pi*n*n/N/2 = X_[n]*e^j*piN*n*n for(n= 0, N22= 2*N - 2; n < N; ++n, --N22) { arg= piN*n*n; v.re= cosl(arg); v.im= sinl(arg); v/= N_; if (complement) v/= N; complex_mul(x + n, x_ + N22, &v); } delete [] x_; delete [] w; delete [] Wstore; }
Оптимизированный вариант fft_step:
static void fft_step(ShortComplex *x, unsigned int T, bool complement, const ShortComplex *Wstore) { unsigned int Nmax, I, J, N, Nd2, N2, k, Skew, Step; unsigned char *Ic= (unsigned char*) &I; unsigned char *Jc= (unsigned char*) &J; ShortComplex S; Nmax= 1 << T; double cmul= complement ? -1.0 : +1.0; //first interchanging for(I = 1; I < Nmax - 1; I++) { Jc[0]= reverse256[Ic[3]]; Jc[1]= reverse256[Ic[2]]; Jc[2]= reverse256[Ic[1]]; Jc[3]= reverse256[Ic[0]]; J >>= (32 - T); if (I < J) { S= x[I]; x[I]= x[J]; x[J]= S; } } double *Warray; double Wre, Wim; double Tre, Tim; double *arr= (double*)x; double *arrEnd= arr + (Nmax + Nmax); //main loop for(N= 2, Skew= Nmax, Step= 1; N <= Nmax; N += N, Skew >>= 1, Step++) { N2= N + N; Nd2= (N >> 1); for(Warray= (double*)Wstore, k= 0; k < Nd2; k++, Warray += Skew) { Wre= *Warray; Wim= cmul*Warray[1]; for(double *x1re= arr + (k + k); x1re < arrEnd; x1re+= N2) { double *x1im= x1re + 1; double *x2re= x1re + N; double *x2im= x2re + 1; Tre = Wre * *x2re - Wim * *x2im; Tim = Wre * *x2im + Wim * *x2re; *x2re= *x1re - Tre; *x2im= *x1im - Tim; *x1re+= Tre; *x1im+= Tim; } } } }