📄 img_fdct_8x8.h
字号:
/* r0 = g0 - h0; // Results in Q2.5 // */
/* r1 = g1 - h1; // Results in Q2.5 // */
/* */
/* q1a= (g2 * C4 + 0x8000) >> 16; // q1a now in Q2 // */
/* s1a= (h2 * C4 + 0x8000) >> 16; // s1a now in Q2 // */
/* q1 = q1a + q1a; // Results in Q3 // */
/* s1 = s1a + s1a; // Results in Q3 // */
/* */
/* s0 = h3 + g3; // Results in Q3 // */
/* q0 = h3 - g3; // Results in Q3 // */
/* */
/* // ------------------------------------------------ // */
/* // Stage 3 // */
/* // // */
/* // Now, the even-half becomes Q0. This happens // */
/* // on P0 and P1 because the multiply-by-c4 was // */
/* // canceled with an upward scaling by sqrt(2), // */
/* // yielding Q3 intermediate values. The final // */
/* // >> 3 leaves these at Q0. On R0 and R1, this // */
/* // happens because c2 and c6 are at Q13.5, // */
/* // yielding Q16 intermediate values. The final // */
/* // >> 16 then leaves those values at Q0. // */
/* // ------------------------------------------------ // */
/* P0 = ((short)(p0 + p1)) >> 3; // Results in Q0 // */
/* P1 = ((short)(p0 - p1)) >> 3; // Results in Q0 // */
/* R1 = (c6 * r1 + c2 * r0 + 0x8000) >> 16; // .. Q0 // */
/* R0 = (c6 * r0 - c2 * r1 + 0x8000) >> 16; // .. Q0 // */
/* */
/* Q1 = q1 + q0; // Results in Q3 // */
/* Q0 = q1 - q0; // Results in Q3 // */
/* S1 = s1 + s0; // Results in Q3 // */
/* S0 = s1 - s0; // Results in Q3 // */
/* */
/* // ------------------------------------------------ // */
/* // Stage 4 // */
/* // // */
/* // Next, the odd-half ends up in Q0. This happens // */
/* // because our values are in Q3 and our cosine // */
/* // terms are in Q13, giving us Q16 intermediate // */
/* // values. The final >> 16 leaves us a Q0 result. // */
/* // ------------------------------------------------ // */
/* F0 = P0; // Results in Q0 // */
/* F4 = P1; // Results in Q0 // */
/* F2 = R1; // Results in Q0 // */
/* F6 = R0; // Results in Q0 // */
/* */
/* F1 = (c7 * Q1 + c1 * S1 + 0x8000) >> 16; // .. Q0 // */
/* F7 = (c7 * S1 - c1 * Q1 + 0x8000) >> 16; // .. Q0 // */
/* F5 = (c3 * Q0 + c5 * S0 + 0x8000) >> 16; // .. Q0 // */
/* F3 = (c3 * S0 - c5 * Q0 + 0x8000) >> 16; // .. Q0 // */
/* */
/* // ------------------------------------------------ // */
/* // Store the results // */
/* // ------------------------------------------------ // */
/* dct_io_ptr[0] = F0; */
/* dct_io_ptr[1] = F1; */
/* dct_io_ptr[2] = F2; */
/* dct_io_ptr[3] = F3; */
/* dct_io_ptr[4] = F4; */
/* dct_io_ptr[5] = F5; */
/* dct_io_ptr[6] = F6; */
/* dct_io_ptr[7] = F7; */
/* */
/* // ------------------------------------------------ // */
/* // Update pointer to next FDCT row. // */
/* // ------------------------------------------------ // */
/* dct_io_ptr += 8; */
/* } */
/* */
/* return; */
/* } */
/* */
/* */
/* TECHNIQUES */
/* The loop nest in the vertical pass has been collapsed into a */
/* single-level loop. Both vertical and horizontal loops have */
/* been software pipelined. */
/* */
/* For performance, portions of the code outside the loops have been */
/* inter-scheduled with the prolog and epilog code of the loops. */
/* Also, twin stack-pointers are used to accelerate stack accesses. */
/* Finally, pointer values and cosine term registers are reused */
/* between the horizontal and vertical loops to reduce the impact of */
/* pointer and constant reinitialization. */
/* */
/* To save codesize, prolog and epilog collapsing have been performed */
/* to the extent that it does not impact performance. */
/* */
/* To reduce register pressure and save some code, the horizontal */
/* loop uses the same pair of pointer registers for both reading and */
/* writing. The pointer increments are on the LDs to permit prolog */
/* and epilog collapsing, since LDs can be speculated. */
/* */
/* ASSUMPTIONS */
/* Stack is aligned to a double-word boundary. */
/* */
/* MEMORY NOTE */
/* No bank conflicts occur. */
/* */
/* The "fdct_data[]" array must be aligned on a double-word (8 byte) */
/* boundary. */
/* */
/* The code requires 4 words of stack space to save Save-On-Entry */
/* (SOE) registers. */
/* */
/* Nearly every cycle of this function performs at least one */
/* memory access. */
/* */
/* NOTES */
/* This code is fully interruptible. Interrupts are blocked only */
/* in branch delay slots. */
/* */
/* The cosine terms have all been scaled by sqrt(2), so that the */
/* "c4" term is basically an even power of 2. */
/* */
/* The code is LITTLE ENDIAN. */
/* */
/* CYCLES */
/* cycles = 50 + 76 * num_fdcts */
/* */
/* For num_fdcts = 6, cycles = 506. */
/* For num_fdcts = 24, cycles = 1874. */
/* */
/* CODESIZE */
/* 980 bytes. */
/* */
/* SOURCE */
/* Chen FDCT. */
/* */
/* ------------------------------------------------------------------------ */
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
/* All Rights Reserved. */
/* ======================================================================== */
#ifndef IMG_FDCT_8X8_H_
#define IMG_FDCT_8X8_H_ 1

/*
 * IMG_fdct_8x8 -- forward DCT entry point described by the comment
 * header of this file.
 *
 *   fdct_data  Coefficient buffer. Per the MEMORY NOTE in this file it
 *              must be aligned on a double-word (8-byte) boundary.
 *              NOTE(review): presumably transformed in place, one 8x8
 *              block of shorts per FDCT -- confirm against the
 *              implementation.
 *   num_fdcts  Number of FDCTs to perform; the documented cycle count
 *              grows linearly with this value.
 *
 * No value is returned.
 */
void IMG_fdct_8x8(short *fdct_data, unsigned int num_fdcts);

#endif /* IMG_FDCT_8X8_H_ */
/* ======================================================================== */
/* End of file: img_fdct_8x8.h */
/* ------------------------------------------------------------------------ */
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
/* All Rights Reserved. */
/* ======================================================================== */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -