📄 cfft.c
字号:
ah = i + k * ido; RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]); RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur); IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]); IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur); RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2); IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2); RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui); IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui); RE(d2) = RE(c2) + IM(c3); IM(d3) = IM(c2) + RE(c3); RE(d3) = RE(c2) - IM(c3); IM(d2) = IM(c2) - RE(c3);#if 1 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));#else ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));#endif } } } }}#ifdef USE_SSEALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 };__declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3){ __asm { push ebx mov ebx, esp and esp, -16 push edi push esi sub esp, 8 movzx edi, WORD PTR [ebx+8] movaps xmm1, XMMWORD PTR negate test edi, edi jle l1_is_zero lea esi, DWORD PTR [edi+edi] add esi, esi sub esi, edi add esi, esi add esi, esi add esi, esi mov eax, DWORD PTR [ebx+16] add esi, eax lea ecx, DWORD PTR [edi+edi] add ecx, ecx add ecx, ecx add ecx, ecx add ecx, eax lea edx, DWORD PTR [edi+edi] add edx, edx add edx, edx add edx, eax xor eax, eax mov DWORD PTR [esp], ebp mov ebp, DWORD PTR [ebx+12]fftloop: lea edi, DWORD PTR [eax+eax] add edi, edi movaps xmm2, XMMWORD PTR [ebp+edi*8] movaps xmm0, XMMWORD PTR [ebp+edi*8+16] movaps xmm7, XMMWORD PTR [ebp+edi*8+32] movaps xmm5, XMMWORD PTR [ebp+edi*8+48] movaps xmm6, xmm2 addps xmm6, xmm0 movaps xmm4, xmm1 xorps xmm4, xmm7 movaps xmm3, xmm1 xorps xmm3, xmm5 xorps xmm2, xmm1 xorps xmm0, xmm1 addps xmm7, xmm5 subps xmm2, xmm0 movaps xmm0, xmm6 shufps xmm0, xmm7, 68 subps xmm4, xmm3 shufps xmm6, xmm7, 238 movaps xmm5, xmm2 shufps xmm5, xmm4, 68 movaps xmm3, xmm0 addps xmm3, xmm6 shufps xmm2, xmm4, 187 subps xmm0, xmm6 movaps xmm4, xmm5 addps xmm4, xmm2 mov edi, DWORD PTR [ebx+16] movaps XMMWORD PTR [edi+eax*8], xmm3 subps xmm5, xmm2 movaps XMMWORD PTR [edx+eax*8], xmm4 movaps XMMWORD PTR [ecx+eax*8], xmm0 movaps XMMWORD PTR [esi+eax*8], xmm5 add eax, 2 movzx eax, ax movzx edi, WORD PTR [ebx+8] cmp eax, edi jl fftloop mov ebp, DWORD PTR [esp]l1_is_zero: add esp, 8 pop esi pop edi mov esp, ebx pop ebx ret }}#endif#if 0static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3){ uint16_t i, k, ac, ah; for (k = 0; k < l1; k++) { ac = 4*k*ido; ah = k*ido; for (i = 0; i < ido; i+=2) { __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16; __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23; __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30; __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0); m1 = _mm_load_ps(&RE(cc[ac+i])); m2 = _mm_load_ps(&RE(cc[ac+i+2*ido])); m3 = _mm_add_ps(m1, m2); m4 = _mm_sub_ps(m1, m2); n1 = _mm_load_ps(&RE(cc[ac+i+ido])); n2 = _mm_load_ps(&RE(cc[ac+i+3*ido])); n3 = _mm_add_ps(n1, n2); n4 = _mm_mul_ps(neg1, n1); n5 = _mm_mul_ps(neg1, n2); n6 = _mm_sub_ps(n4, n5); m5 = _mm_add_ps(m3, n3); n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1)); n8 = _mm_add_ps(m4, n7); m6 = _mm_sub_ps(m3, n3); n9 = _mm_sub_ps(m4, n7); _mm_store_ps(&RE(ch[ah+i]), m5);#if 0 static INLINE void ComplexMult(real_t *y1, real_t *y2, real_t x1, real_t x2, real_t c1, real_t c2) { *y1 = MUL_F(x1, c1) + MUL_F(x2, c2); *y2 = MUL_F(x2, c1) - MUL_F(x1, c2); } m7.0 = RE(c2)*RE(wa1[i]) m7.1 = IM(c2)*IM(wa1[i]) m7.2 = RE(c6)*RE(wa1[i+1]) m7.3 = IM(c6)*IM(wa1[i+1]) m8.0 = RE(c2)*IM(wa1[i]) m8.1 = IM(c2)*RE(wa1[i]) m8.2 = RE(c6)*IM(wa1[i+1]) m8.3 = IM(c6)*RE(wa1[i+1]) RE(0) = m7.0 - m7.1 IM(0) = m8.0 + m8.1 RE(1) = m7.2 - m7.3 IM(1) = m8.2 + m8.3 //// RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i]) IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i]) RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1]) IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])#endif w1 = _mm_load_ps(&RE(wa1[i])); w3 = _mm_load_ps(&RE(wa2[i])); w5 = _mm_load_ps(&RE(wa3[i])); w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1)); m7 = _mm_mul_ps(n8, w1); m15 = _mm_mul_ps(m6, w3); m23 = _mm_mul_ps(n9, w5); m8 = _mm_mul_ps(n8, w2); m16 = _mm_mul_ps(m6, w4); m24 = _mm_mul_ps(n9, w6); m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0)); m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0)); m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1)); m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1)); m11 = _mm_add_ps(m9, m10); m19 = _mm_add_ps(m17, m18); m27 = _mm_add_ps(m25, m26); m12 = _mm_sub_ps(m9, m10); m20 = _mm_sub_ps(m17, m18); m28 = _mm_sub_ps(m25, m26); m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2)); m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2)); m14 = _mm_unpacklo_ps(m12, m13); m22 = _mm_unpacklo_ps(m20, m21); m30 = _mm_unpacklo_ps(m28, m29); _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22); _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30); } }}#endifstatic void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3){ uint16_t i, k, ac, ah; if (ido == 1) { for (k = 0; k < l1; k++) { complex_t t1, t2, t3, t4; ac = 4*k; ah = k; RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); RE(t1) = RE(cc[ac]) - RE(cc[ac+2]); IM(t2) = IM(cc[ac]) + IM(cc[ac+2]); IM(t1) = IM(cc[ac]) - IM(cc[ac+2]); RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]); IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]); IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]); RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]); RE(ch[ah]) = RE(t2) + RE(t3); RE(ch[ah+2*l1]) = RE(t2) - RE(t3); IM(ch[ah]) = IM(t2) + IM(t3); IM(ch[ah+2*l1]) = IM(t2) - IM(t3); RE(ch[ah+l1]) = RE(t1) + RE(t4); RE(ch[ah+3*l1]) = RE(t1) - RE(t4); IM(ch[ah+l1]) = IM(t1) + IM(t4); IM(ch[ah+3*l1]) = IM(t1) - IM(t4); } } else { for (k = 0; k < l1; k++) { ac = 4*k*ido; ah = k*ido; for (i = 0; i < ido; i++) { complex_t c2, c3, c4, t1, t2, t3, t4; RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]); RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]); IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]); IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]); RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]); IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]); IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]); RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]); RE(c2) = RE(t1) + RE(t4); RE(c4) = RE(t1) - RE(t4); IM(c2) = IM(t1) + IM(t4); IM(c4) = IM(t1) - IM(t4); RE(ch[ah+i]) = RE(t2) + RE(t3); RE(c3) = RE(t2) - RE(t3); IM(ch[ah+i]) = IM(t2) + IM(t3); IM(c3) = IM(t2) - IM(t3);#if 1 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]), IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i])); ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]), IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));#else ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]), RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i])); ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]), RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));#endif } } }}static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3){ uint16_t i, k, ac, ah; if (ido == 1) { for (k = 0; k < l1; k++) { complex_t t1, t2, t3, t4; ac = 4*k; ah = k; RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -