📄 img_idct_8x8_12q4.h64
字号:
* /* The outer loop steps between IDCT blocks, whereas the */ *
* /* inner loop focuses on rows within each IDCT block. */ *
* /* -------------------------------------------------------- */ *
* for (i = num_idcts - 1; i >= 0; i--) *
* { *
* for (j = 0; j < 8; j++) *
* { *
* /* ------------------------------------------------ */ *
* /* Stage 0: Load in freq.-domain coefficients. */ *
* /* ------------------------------------------------ */ *
* F0 = idct[i][j][0]; *
* F1 = idct[i][j][1]; *
* F2 = idct[i][j][2]; *
* F3 = idct[i][j][3]; *
* F4 = idct[i][j][4]; *
* F5 = idct[i][j][5]; *
* F6 = idct[i][j][6]; *
* F7 = idct[i][j][7]; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 1 of signal flow graph. */ *
* /* ------------------------------------------------ */ *
* P0 = F0; P1 = F4; *
* R1 = F2; R0 = F6; *
* *
* Q1 = (F1*C7 - F7*C1 + 0x8000) >> 16; *
* Q0 = (F5*C3 - F3*C5 + 0x8000) >> 16; *
* S0 = (F5*C5 + F3*C3 + 0x8000) >> 16; *
* S1 = (F1*C1 + F7*C7 + 0x8000) >> 16; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 2 of signal flow graph. */ *
* /* ------------------------------------------------ */ *
* p0 = (((int)P0 + (int)P1 + 1) >> 1) + 15; *
* p1 = (((int)P0 - (int)P1 ) >> 1) + 16; *
* r1 = (R1*C6 - R0*C2 + 0x8000) >> 16; *
* r0 = (R1*C2 + R0*C6 + 0x8000) >> 16; *
* *
* s1 = (S1 + S0); q1 = (Q1 + Q0); *
* s0 = (S1 - S0); q0 = (Q1 - Q0); *
* *
* /* ------------------------------------------------ */ *
* /* Stage 3 of signal flow graph. */ *
* /* ------------------------------------------------ */ *
* g0 = (p0 + r0); g1 = (p1 + r1); *
* h0 = (p0 - r0); h1 = (p1 - r1); *
* *
* h2 = s1; g2 = q1; *
* g3 = (s0*C0 - q0*C0 + 0x8000) >> 16; *
* h3 = (s0*C0 + q0*C0 + 0x8000) >> 16; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 4 of signal flow graph. */ *
* /* ------------------------------------------------ */ *
* f0 = (g0 + h2); f7 = (g0 - h2); *
* f1 = (g1 + h3); f6 = (g1 - h3); *
* f2 = (h1 + g3); f5 = (h1 - g3); *
* f3 = (h0 + g2); f4 = (h0 - g2); *
* *
* /* ------------------------------------------------ */ *
* /* Stage 4.1: Q-pt adjust: Bit 15 is don't-care. */ *
* /* ------------------------------------------------ */ *
* f0r = f0 + f0; f7r = f7 + f7; *
* f1r = f1 + f1; f6r = f6 + f6; *
* f2r = f2 + f2; f5r = f5 + f5; *
* f3r = f3 + f3; f4r = f4 + f4; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 4.2: Saturate results to 9Q6. */ *
* /* ------------------------------------------------ */ *
* f0s = f0r>0x3FFF?0x3FFF: f0r<-0x4000?-0x4000 : f0r; *
* f1s = f1r>0x3FFF?0x3FFF: f1r<-0x4000?-0x4000 : f1r; *
* f2s = f2r>0x3FFF?0x3FFF: f2r<-0x4000?-0x4000 : f2r; *
* f3s = f3r>0x3FFF?0x3FFF: f3r<-0x4000?-0x4000 : f3r; *
* f4s = f4r>0x3FFF?0x3FFF: f4r<-0x4000?-0x4000 : f4r; *
* f5s = f5r>0x3FFF?0x3FFF: f5r<-0x4000?-0x4000 : f5r; *
* f6s = f6r>0x3FFF?0x3FFF: f6r<-0x4000?-0x4000 : f6r; *
* f7s = f7r>0x3FFF?0x3FFF: f7r<-0x4000?-0x4000 : f7r; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 4.3: Truncate results to 9Q0. */ *
* /* ------------------------------------------------ */ *
* f0t = f0s >> 6; f7t = f7s >> 6; *
* f1t = f1s >> 6; f6t = f6s >> 6; *
* f2t = f2s >> 6; f5t = f5s >> 6; *
* f3t = f3s >> 6; f4t = f4s >> 6; *
* *
* /* ------------------------------------------------ */ *
* /* Stage 5: Store sample-domain results. */ *
* /* ------------------------------------------------ */ *
* idct[i][j][0] = f0t; *
* idct[i][j][1] = f1t; *
* idct[i][j][2] = f2t; *
* idct[i][j][3] = f3t; *
* idct[i][j][4] = f4t; *
* idct[i][j][5] = f5t; *
* idct[i][j][6] = f6t; *
* idct[i][j][7] = f7t; *
* } *
* } *
* *
* return; *
* } *
* *
* Note: This code guarantees correct operation, even in the case *
* that 'num_idcts == 0'. In that case, the function runs for only *
* 13 cycles (counting 6 cycles of function-call overhead), due to *
* early-exit code. Also, the assembly code imposes additional data *
* alignment restrictions that are not present in the C code above. *
* *
* TECHNIQUES *
* All levels of looping are collapsed into single loops which are *
* pipelined. The outer loop focuses on 8-pt IDCTs, whereas the *
* inner loop controls the column-pointer to handle jumps between *
* IDCT blocks. (The column-pointer adjustment is handled by a *
* four-phase rotating "fixup" constant which takes the place of *
* the original inner-loop.) *
* *
* For performance, portions of the outer-loop code have been *
* inter-scheduled with the prologs and epilogs of both loops. *
* Finally, cosine term registers are reused between the horizontal *
* and vertical loops to save the need for reinitialization. *
* *
* To save codesize, prolog and epilog collapsing have been performed *
* to the extent that performance is not affected. The remaining *
* prolog and epilog code has been interscheduled with code outside *
* the loops to improve performance. *
* *
* Additional section-specific optimization notes are provided below. *
* *
* ASSUMPTIONS *
* This is a LITTLE ENDIAN implementation. *
* *
* The input array must be aligned on a double-word boundary. *
* *
* MEMORY NOTE *
* No bank conflicts occur. *
* The input array must be aligned on a double-word boundary. *
* *
* Bank usage for N 32-bit banks: *
* *
* Vert loop accesses: 2 of N banks for 54% of cycles *
* 1 of N banks for 36% of cycles *
* 0 of N banks for 9% of cycles *
* *
* Horiz loop accesses: 4 of N banks for 16% of cycles *
* 2 of N banks for 33% of cycles *
* 0 of N banks for 50% of cycles *
* *
* The code may perform speculative reads of up to 128 bytes *
* beyond the end or before the start of the IDCT array. The *
* speculatively accessed data is ignored. *
* *
* NOTES *
* This is fully interruptible and fully reentrant. *
* *
* The cosine terms have all been scaled by sqrt(2), so that the *
* "c4" term is basically an even power of 2. *
* *
* CYCLES *
* cycles = 62 + 92 * num_idcts, for num_idcts > 0 *
* cycles = 13, for num_idcts == 0. *
* *
* For num_idcts = 6, cycles = 614. *
* For num_idcts = 24, cycles = 2270. *
* *
* CODESIZE *
* 968 bytes *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
.global _IMG_idct_8x8_12q4
* ========================================================================= *
* End of file: img_idct_8x8_12q4.h64 *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -