📄 imdct.c
字号:
"shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" "mulps %%xmm0, %%xmm2 \n\t" "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" "subps %%xmm0, %%xmm2 \n\t" "movzbl (%%eax), %%edx \n\t" "movzbl 1(%%eax), %%ebp \n\t" "movlps %%xmm2, (%1, %%edx,8) \n\t" "movhps %%xmm2, (%1, %%ebp,8) \n\t" "addl $16, %%esi \n\t" "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap "subl $16, %%edi \n\t" " jnc 1b \n\t" "popl %%ebp \n\t"//no we didnt touch ebp *g* :: "b" (data), "c" (buf) : "%esi", "%edi", "%eax", "%edx" ); /* FFT Merge *//* unoptimized variant for (m=1; m < 7; m++) { if(m) two_m = (1 << m); else two_m = 1; two_m_plus_one = (1 << (m+1)); for(i = 0; i < 128; i += two_m_plus_one) { for(k = 0; k < two_m; k++) { p = k + i; q = p + two_m; tmp_a_r = buf[p].real; tmp_a_i = buf[p].imag; tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; buf[p].real = tmp_a_r + tmp_b_r; buf[p].imag = tmp_a_i + tmp_b_i; buf[q].real = tmp_a_r - tmp_b_r; buf[q].imag = tmp_a_i - tmp_b_i; } } }*/ /* 1. iteration */ // Note w[0][0]={1,0} asm volatile( "xorps %%xmm1, %%xmm1 \n\t" "xorps %%xmm2, %%xmm2 \n\t" "movl %0, %%esi \n\t" ".balign 16 \n\t" "1: \n\t" "movlps (%%esi), %%xmm0 \n\t" //buf[p] "movlps 8(%%esi), %%xmm1\n\t" //buf[q] "movhps (%%esi), %%xmm0 \n\t" //buf[p] "movhps 8(%%esi), %%xmm2\n\t" //buf[q] "addps %%xmm1, %%xmm0 \n\t" "subps %%xmm2, %%xmm0 \n\t" "movaps %%xmm0, (%%esi) \n\t" "addl $16, %%esi \n\t" "cmpl %1, %%esi \n\t" " jb 1b \n\t" :: "g" (buf), "r" (buf + 128) : "%esi" ); /* 2. iteration */ // Note w[1]={{1,0}, {0,-1}} asm volatile( "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 "movl %0, %%esi \n\t" ".balign 16 \n\t" "1: \n\t" "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 "addps %%xmm2, %%xmm0 \n\t" "subps %%xmm2, %%xmm1 \n\t" "movaps %%xmm0, (%%esi) \n\t" "movaps %%xmm1, 16(%%esi) \n\t" "addl $32, %%esi \n\t" "cmpl %1, %%esi \n\t" " jb 1b \n\t" :: "g" (buf), "r" (buf + 128) : "%esi" ); /* 3. iteration *//* Note sseW2+0={1,1,sqrt(2),sqrt(2)) Note sseW2+16={0,0,sqrt(2),-sqrt(2)) Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) Note sseW2+48={1,-1,sqrt(2),-sqrt(2))*/ asm volatile( "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" "xorps %%xmm5, %%xmm5 \n\t" "xorps %%xmm2, %%xmm2 \n\t" "movl %0, %%esi \n\t" ".balign 16 \n\t" "1: \n\t" "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 "mulps %%xmm2, %%xmm4 \n\t" "mulps %%xmm3, %%xmm5 \n\t" "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 "mulps %%xmm6, %%xmm3 \n\t" "mulps %%xmm7, %%xmm2 \n\t" "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 "addps %%xmm4, %%xmm2 \n\t" "addps %%xmm5, %%xmm3 \n\t" "movaps %%xmm2, %%xmm4 \n\t" "movaps %%xmm3, %%xmm5 \n\t" "addps %%xmm0, %%xmm2 \n\t" "addps %%xmm1, %%xmm3 \n\t" "subps %%xmm4, %%xmm0 \n\t" "subps %%xmm5, %%xmm1 \n\t" "movaps %%xmm2, (%%esi) \n\t" "movaps %%xmm3, 16(%%esi) \n\t" "movaps %%xmm0, 32(%%esi) \n\t" "movaps %%xmm1, 48(%%esi) \n\t" "addl $64, %%esi \n\t" "cmpl %1, %%esi \n\t" " jb 1b \n\t" :: "g" (buf), "r" (buf + 128) : "%esi" ); /* 4-7. iterations */ for (m=3; m < 7; m++) { two_m = (1 << m); two_m_plus_one = two_m<<1; asm volatile( "movl %0, %%esi \n\t" ".balign 16 \n\t" "1: \n\t" "xorl %%edi, %%edi \n\t" // k "leal (%%esi, %3), %%edx \n\t" "2: \n\t" "movaps (%%edx, %%edi), %%xmm1 \n\t" "movaps (%4, %%edi, 2), %%xmm2 \n\t" "mulps %%xmm1, %%xmm2 \n\t" "shufps $0xB1, %%xmm1, %%xmm1 \n\t" "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" "movaps (%%esi, %%edi), %%xmm0 \n\t" "addps %%xmm2, %%xmm1 \n\t" "movaps %%xmm1, %%xmm2 \n\t" "addps %%xmm0, %%xmm1 \n\t" "subps %%xmm2, %%xmm0 \n\t" "movaps %%xmm1, (%%esi, %%edi) \n\t" "movaps %%xmm0, (%%edx, %%edi) \n\t" "addl $16, %%edi \n\t" "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 " jb 2b \n\t" "addl %2, %%esi \n\t" "cmpl %1, %%esi \n\t" " jb 1b \n\t" :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3), "r" (sseW[m]) : "%esi", "%edi", "%edx" ); } /* Post IFFT complex multiply plus IFFT complex conjugate*/ asm volatile( "movl $-1024, %%esi \n\t" ".balign 16 \n\t" "1: \n\t" "movaps (%0, %%esi), %%xmm0 \n\t" "movaps (%0, %%esi), %%xmm1 \n\t" "shufps $0xB1, %%xmm0, %%xmm0 \n\t" "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" "addps %%xmm1, %%xmm0 \n\t" "movaps %%xmm0, (%0, %%esi) \n\t" "addl $16, %%esi \n\t" " jnz 1b \n\t" :: "r" (buf+128) : "%esi" ); data_ptr = data; delay_ptr = delay; window_ptr = imdct_window; /* Window and convert to real valued signal */ asm volatile( "xorl %%edi, %%edi \n\t" // 0 "xorl %%esi, %%esi \n\t" // 0 "movss %3, %%xmm2 \n\t" // bias "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... ".balign 16 \n\t" "1: \n\t" "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" "addps (%2, %%esi), %%xmm0 \n\t" "addps %%xmm2, %%xmm0 \n\t" "movaps %%xmm0, (%1, %%esi) \n\t" "addl $16, %%esi \n\t" "subl $16, %%edi \n\t" "cmpl $512, %%esi \n\t" " jb 1b \n\t" :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) : "%esi", "%edi" ); data_ptr+=128; delay_ptr+=128;// window_ptr+=128; asm volatile( "movl $1024, %%edi \n\t" // 512 "xorl %%esi, %%esi \n\t" // 0 "movss %3, %%xmm2 \n\t" // bias "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... ".balign 16 \n\t" "1: \n\t" "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" "addps (%2, %%esi), %%xmm0 \n\t" "addps %%xmm2, %%xmm0 \n\t" "movaps %%xmm0, (%1, %%esi) \n\t" "addl $16, %%esi \n\t" "subl $16, %%edi \n\t" "cmpl $512, %%esi \n\t" " jb 1b \n\t" :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) : "%esi", "%edi" ); data_ptr+=128;// window_ptr+=128; /* The trailing edge of the window goes into the delay line */ delay_ptr = delay; asm volatile( "xorl %%edi, %%edi \n\t" // 0 "xorl %%esi, %%esi \n\t" // 0 ".balign 16 \n\t" "1: \n\t" "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" "movaps %%xmm0, (%1, %%esi) \n\t" "addl $16, %%esi \n\t" "subl $16, %%edi \n\t" "cmpl $512, %%esi \n\t" " jb 1b \n\t" :: "r" (buf+64), "r" (delay_ptr) : "%esi", "%edi" ); delay_ptr+=128;// window_ptr-=128; asm volatile( "movl $1024, %%edi \n\t" // 1024 "xorl %%esi, %%esi \n\t" // 0 ".balign 16 \n\t" "1: \n\t" "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" "movaps %%xmm0, (%1, %%esi) \n\t" "addl $16, %%esi \n\t" "subl $16, %%edi \n\t" "cmpl $512, %%esi \n\t" " jb 1b \n\t" :: "r" (buf), "r" (delay_ptr) : "%esi", "%edi" );}#endif //arch_x86voidimdct_do_256(sample_t data[],sample_t delay[],sample_t bias){ int i,k; int p,q; int m; int two_m; int two_m_plus_one; sample_t tmp_a_i; sample_t tmp_a_r; sample_t tmp_b_i; sample_t tmp_b_r; sample_t *data_ptr; sample_t *delay_ptr; sample_t *window_ptr; complex_t *buf_1, *buf_2; buf_1 = &buf[0]; buf_2 = &buf[64]; /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ for(k=0; k<64; k++) { /* X1[k] = X[2*k] */ /* X2[k] = X[2*k+1] */ p = 2 * (128-2*k-1); q = 2 * (2 * k); /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */ buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k]; buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]); /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */ buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k]; buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]); } /* IFFT Bit reversed shuffling */ for(i=0; i<64; i++) { k = bit_reverse_256[i]; if (k < i) { swap_cmplx(&buf_1[i],&buf_1[k]); swap_cmplx(&buf_2[i],&buf_2[k]); } } /* FFT Merge */ for (m=0; m < 6; m++) { two_m = (1 << m); two_m_plus_one = (1 << (m+1)); /* FIXME */ if(m) two_m = (1 << m); else two_m = 1; for(k = 0; k < two_m; k++) { for(i = 0; i < 64; i += two_m_plus_one) { p = k + i; q = p + two_m; /* Do block 1 */ tmp_a_r = buf_1[p].real; tmp_a_i = buf_1[p].imag; tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag; tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag; buf_1[p].real = tmp_a_r + tmp_b_r; buf_1[p].imag = tmp_a_i + tmp_b_i; buf_1[q].real = tmp_a_r - tmp_b_r; buf_1[q].imag = tmp_a_i - tmp_b_i; /* Do block 2 */ tmp_a_r = buf_2[p].real; tmp_a_i = buf_2[p].imag; tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag; tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag; buf_2[p].real = tmp_a_r + tmp_b_r; buf_2[p].imag = tmp_a_i + tmp_b_i; buf_2[q].real = tmp_a_r - tmp_b_r; buf_2[q].imag = tmp_a_i - tmp_b_i; } } } /* Post IFFT complex multiply */ for( i=0; i < 64; i++) { /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */ tmp_a_r = buf_1[i].real; tmp_a_i = -buf_1[i].imag; buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */ tmp_a_r = buf_2[i].real; tmp_a_i = -buf_2[i].imag; buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); } data_ptr = data; delay_ptr = delay;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -