📄 fdct_8x8.h62

📁 DSP GEPE 压缩算法
💻 H62
📖 第 1 页 / 共 2 页
字号:
上一页 12
*             f3 = dct_io_ptr[3];                                           *
*             f4 = dct_io_ptr[4];                                           *
*             f5 = dct_io_ptr[5];                                           *
*             f6 = dct_io_ptr[6];                                           *
*             f7 = dct_io_ptr[7];                                           *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 1:  Separate into even and odd halves.        */    *
*             /* ---------------------------------------------------- */    *
*             g0 = f0 + f7;               h2 = f0 - f7;                     *
*             g1 = f1 + f6;               h3 = f1 - f6;                     *
*             h1 = f2 + f5;               g3 = f2 - f5;                     *
*             h0 = f3 + f4;               g2 = f3 - f4;                     *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 2                                             */    *
*             /* ---------------------------------------------------- */    *
*             p0 = g0 + h0;               r0 = g0 - h0;                     *
*             p1 = g1 + h1;               r1 = g1 - h1;                     *
*             q1 = g2;                    s1 = h2;                          *
*                                                                           *
*             s0a= h3 + g3;               q0a= h3 - g3;                     *
*             q0 = (q0a * c0 + 0x7FFF) >> 16;                               *
*             s0 = (s0a * c0 + 0x7FFF) >> 16;                               *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 3                                             */    *
*             /* ---------------------------------------------------- */    *
*             P0 = p0 + p1;               P1 = p0 - p1;                     *
*             R1 = c6 * r1 + c2 * r0;     R0 = c6 * r0 - c2 * r1;           *
*                                                                           *
*             Q1 = q1 + q0;               Q0 = q1 - q0;                     *
*             S1 = s1 + s0;               S0 = s1 - s0;                     *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 4                                             */    *
*             /* ---------------------------------------------------- */    *
*             F0 = P0;                    F4 = P1;                          *
*             F2 = R1;                    F6 = R0;                          *
*                                                                           *
*             F1 = c7 * Q1 + c1 * S1;     F7 = c7 * S1 - c1 * Q1;           *
*             F5 = c3 * Q0 + c5 * S0;     F3 = c3 * S0 - c5 * Q0;           *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Round and truncate values.                          */    *
*             /*                                                      */    *
*             /*  Note: F0 and F4 have different rounding since no    */    *
*             /*  MPYs have been applied to either term.  Also, F0's  */    *
*             /*  rounding is slightly different to offset the        */    *
*             /*  truncation effects from the horizontal pass (which  */    *
*             /*  does not round).                                    */    *
*             /* ---------------------------------------------------- */    *
*             F0r = (F0 + 0x0006) >>  3;                                    *
*             F1r = (F1 + 0x7FFF) >> 16;                                    *
*             F2r = (F2 + 0x7FFF) >> 16;                                    *
*             F3r = (F3 + 0x7FFF) >> 16;                                    *
*             F4r = (F4 + 0x0004) >>  3;                                    *
*             F5r = (F5 + 0x7FFF) >> 16;                                    *
*             F6r = (F6 + 0x7FFF) >> 16;                                    *
*             F7r = (F7 + 0x7FFF) >> 16;                                    *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Store the results                                   */    *
*             /* ---------------------------------------------------- */    *
*             dct_io_ptr[0] = F0r;                                          *
*             dct_io_ptr[1] = F1r;                                          *
*             dct_io_ptr[2] = F2r;                                          *
*             dct_io_ptr[3] = F3r;                                          *
*             dct_io_ptr[4] = F4r;                                          *
*             dct_io_ptr[5] = F5r;                                          *
*             dct_io_ptr[6] = F6r;                                          *
*             dct_io_ptr[7] = F7r;                                          *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Update pointer to next FDCT row.                    */    *
*             /* ---------------------------------------------------- */    *
*             dct_io_ptr += 8;                                              *
*         }                                                                 *
*                                                                           *
*         return;                                                           *
*       }                                                                   *
*                                                                           *
*                                                                           *
*       Note:  This code guarantees correct operation, even in the case     *
*       that 'num_fdcts == 0'.  In this case, the function runs for only    *
*       13 cycles (counting 6 cycles of function-call overhead), due to     *
*       early-exit code.  The early-exit case performs no accesses to the   *
*       fdct_data[] array and minimal access to the stack.                  *
*                                                                           *
*   TECHNIQUES                                                              *
*       The loop nest in the vertical pass has been collapsed into a        *
*       single-level loop.  Both vertical and horizontal loops have         *
*       been software pipelined.                                            *
*                                                                           *
*       For performance, portions of the code outside the loops have been   *
*       inter-scheduled with the prolog and epilog code of the loops.       *
*       Also, twin stack-pointers are used to accelerate stack accesses.    *
*       Finally, pointer values and cosine term registers are reused        *
*       between the horizontal and vertical loops to reduce the impact of   *
*       pointer and constant reinitialization.                              *
*                                                                           *
*       To save codesize, prolog and epilog collapsing have been performed  *
*       to the extent that it does not impact performance.  Also, code      *
*       outside the loops has been scheduled to pack as tightly into        *
*       fetch packets as possible to avoid alignment padding NOPs.          *
*                                                                           *
*       To reduce register pressure and save some code, the horizontal      *
*       loop uses the same pair of pointer registers for both reading and   *
*       writing.  The pointer increments are on the LDs to permit prolog    *
*       and epilog collapsing, since LDs can be speculated.                 *
*                                                                           *
*       Additional section-specific optimization notes are provided below.  *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       Stack is aligned to a word boundary.                                *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur, regardless of fdct_data[]'s alignment.     *
*                                                                           *
*       The code requires 16 words of stack space to save Save-On-Entry     *
*       (SOE) registers, CSR, IRP, and a spill value.                       *
*                                                                           *
*       Bank usage on C6201:  1 of 4 banks for 40% of loop cycles           *
*                             2 of 4 banks for 60% of loop cycles           *
*                                                                           *
*       Nearly every cycle of this function performs at least one           *
*       memory access.                                                      *
*                                                                           *
*   NOTES                                                                   *
*       This code masks interrupts for nearly its entire duration.          *
*       Interrupts are locked out for '40 + 160 * num_fdcts' cycles.  As    *
*       a result, the code is interrupt-tolerant, but not interruptible.    *
*                                                                           *
*       The cosine terms have all been scaled by sqrt(2), so that the       *
*       "c4" term is basically an even power of 2.                          *
*                                                                           *
*       The code is completely ENDIAN NEUTRAL.                              *
*                                                                           *
*   CYCLES                                                                  *
*       cycles = 48 + 160 * num_fdcts                                       *
*                                                                           *
*       For num_fdcts =  6, cycles = 1008.                                  *
*       For num_fdcts = 24, cycles = 3888.                                  *
*                                                                           *
*   CODESIZE                                                                *
*       1216 bytes.                                                         *
*                                                                           *
*   SOURCE                                                                  *
*       Chen FDCT.                                                          *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

        ; C prototype: void fdct_8x8(short *fdct_data, unsigned num_fdcts)
        .global _fdct_8x8

* ========================================================================= *
*   End of file:  fdct_8x8.h62                                              *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -