📄 fdct_8x8.h62
字号:
* f3 = dct_io_ptr[3]; *
* f4 = dct_io_ptr[4]; *
* f5 = dct_io_ptr[5]; *
* f6 = dct_io_ptr[6]; *
* f7 = dct_io_ptr[7]; *
* *
* /* ---------------------------------------------------- */ *
* /* Stage 1: Separate into even and odd halves. */ *
* /* ---------------------------------------------------- */ *
* g0 = f0 + f7; h2 = f0 - f7; *
* g1 = f1 + f6; h3 = f1 - f6; *
* h1 = f2 + f5; g3 = f2 - f5; *
* h0 = f3 + f4; g2 = f3 - f4; *
* *
* /* ---------------------------------------------------- */ *
* /* Stage 2 */ *
* /* ---------------------------------------------------- */ *
* p0 = g0 + h0; r0 = g0 - h0; *
* p1 = g1 + h1; r1 = g1 - h1; *
* q1 = g2; s1 = h2; *
* *
* s0a= h3 + g3; q0a= h3 - g3; *
* q0 = (q0a * c0 + 0x7FFF) >> 16; *
* s0 = (s0a * c0 + 0x7FFF) >> 16; *
* *
* /* ---------------------------------------------------- */ *
* /* Stage 3 */ *
* /* ---------------------------------------------------- */ *
* P0 = p0 + p1; P1 = p0 - p1; *
* R1 = c6 * r1 + c2 * r0; R0 = c6 * r0 - c2 * r1; *
* *
* Q1 = q1 + q0; Q0 = q1 - q0; *
* S1 = s1 + s0; S0 = s1 - s0; *
* *
* /* ---------------------------------------------------- */ *
* /* Stage 4 */ *
* /* ---------------------------------------------------- */ *
* F0 = P0; F4 = P1; *
* F2 = R1; F6 = R0; *
* *
* F1 = c7 * Q1 + c1 * S1; F7 = c7 * S1 - c1 * Q1; *
* F5 = c3 * Q0 + c5 * S0; F3 = c3 * S0 - c5 * Q0; *
* *
* /* ---------------------------------------------------- */ *
* /* Round and truncate values. */ *
* /* */ *
* /* Note: F0 and F4 have different rounding since no */ *
* /* MPYs have been applied to either term. Also, F0's */ *
* /* rounding is slightly different to offset the */ *
* /* truncation effects from the horizontal pass (which */ *
* /* does not round). */ *
* /* ---------------------------------------------------- */ *
* F0r = (F0 + 0x0006) >> 3; *
* F1r = (F1 + 0x7FFF) >> 16; *
* F2r = (F2 + 0x7FFF) >> 16; *
* F3r = (F3 + 0x7FFF) >> 16; *
* F4r = (F4 + 0x0004) >> 3; *
* F5r = (F5 + 0x7FFF) >> 16; *
* F6r = (F6 + 0x7FFF) >> 16; *
* F7r = (F7 + 0x7FFF) >> 16; *
* *
* /* ---------------------------------------------------- */ *
* /* Store the results */ *
* /* ---------------------------------------------------- */ *
* dct_io_ptr[0] = F0r; *
* dct_io_ptr[1] = F1r; *
* dct_io_ptr[2] = F2r; *
* dct_io_ptr[3] = F3r; *
* dct_io_ptr[4] = F4r; *
* dct_io_ptr[5] = F5r; *
* dct_io_ptr[6] = F6r; *
* dct_io_ptr[7] = F7r; *
* *
* /* ---------------------------------------------------- */ *
* /* Update pointer to next FDCT row. */ *
* /* ---------------------------------------------------- */ *
* dct_io_ptr += 8; *
* } *
* *
* return; *
* } *
* *
* *
* Note: This code guarantees correct operation, even in the case *
* that 'num_fdcts == 0'. In this case, the function runs for only *
* 13 cycles (counting 6 cycles of function-call overhead), due to *
* early-exit code. The early-exit case performs no accesses to the *
* fdct_data[] array and minimal access to the stack. *
* *
* TECHNIQUES *
* The loop nest in the vertical pass has been collapsed into a *
* single-level loop. Both vertical and horizontal loops have *
* been software pipelined. *
* *
* For performance, portions of the code outside the loops have been *
* inter-scheduled with the prolog and epilog code of the loops. *
* Also, twin stack-pointers are used to accelerate stack accesses. *
* Finally, pointer values and cosine term registers are reused *
* between the horizontal and vertical loops to reduce the impact of *
* pointer and constant reinitialization. *
* *
* To save codesize, prolog and epilog collapsing have been performed *
* to the extent that it does not impact performance. Also, code *
* outside the loops has been scheduled to pack as tightly into *
* fetch packets as possible to avoid alignment padding NOPs. *
* *
* To reduce register pressure and save some code, the horizontal *
* loop uses the same pair of pointer registers for both reading and *
* writing. The pointer increments are on the LDs to permit prolog *
* and epilog collapsing, since LDs can be speculated. *
* *
* Additional section-specific optimization notes are provided below. *
* *
* ASSUMPTIONS *
* Stack is aligned to a word boundary. *
* *
* MEMORY NOTE *
* No bank conflicts occur, regardless of fdct_data[]'s alignment. *
* *
* The code requires 16 words of stack space to save Save-On-Entry *
* (SOE) registers, CSR, IRP, and a spill value. *
* *
* Bank usage on C6201: 1 of 4 banks for 40% of loop cycles *
* 2 of 4 banks for 60% of loop cycles *
* *
* Nearly every cycle of this function performs at least one *
* memory access. *
* *
* NOTES *
* This code masks interrupts for nearly its entire duration. *
* Interrupts are locked out for '40 + 160 * num_fdcts' cycles. As *
* a result, the code is interrupt-tolerant, but not interruptible. *
* *
* The cosine terms have all been scaled by sqrt(2), so that the *
* "c4" term is basically an even power of 2. *
* *
* The code is completely ENDIAN NEUTRAL. *
* *
* CYCLES *
* cycles = 48 + 160 * num_fdcts *
* *
* For num_fdcts = 6, cycles = 1008. *
* For num_fdcts = 24, cycles = 3888. *
* *
* CODESIZE *
* 1216 bytes. *
* *
* SOURCE *
* Chen FDCT. *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 1999 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; Exports the FDCT entry point so that it can be referenced from other modules.
; C prototype matching the exported symbol (assumes the TI COFF convention of
; prefixing C identifiers with '_' -- TODO confirm against the toolchain ABI):
;
;   void fdct_8x8(short *fdct_data, unsigned num_fdcts)
;
; NOTE(review): this comment previously named the function 'fdct_8x8_asm',
; which does not match the '_fdct_8x8' symbol declared below -- confirm the
; intended name against the accompanying .asm source.
.global _fdct_8x8
* ========================================================================= *
* End of file: fdct_8x8.h62 *
* ------------------------------------------------------------------------- *
* Copyright (c) 1999 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -