speedy.c

来自「linux下的MPEG1」· C语言 代码 · 共 2,101 行 · 第 1/5 页

C
2,101
字号
{    int r1 = 0;    int r2 = 0;    data += 2;    width -= 1;    while( width-- ) {        int s1, s2;        s1 = *data + r1; r1 = *data;        s2 = s1    + r2; r2 = s1;        *(data - 2) = s2 >> 2;        data += 2;    }}static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width ){    int r1 = 0;    int r2 = 0;    int r3 = 0;    int r4 = 0;    width -= 4;    data += 4;    while( width-- ) {        int s1, s2, s3, s4;        s1 = *data + r1; r1 = *data;        s2 = s1    + r2; r2 = s1;        s3 = s2    + r3; r3 = s2;        s4 = s3    + r4; r4 = s3;        *(data - 4) = s4 >> 4;        data += 2;    }}static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top,                                              uint8_t *bot, int width ){    int i;    for( i = width*2; i; --i ) {        *output++ = ((*top++) + (*bot++)) >> 1;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,                                                uint8_t *bot, int width ){    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */    int i;    for( i = width/16; i; --i ) {        movq_m2r( *bot, mm0 );        movq_m2r( *top, mm1 );        movq_m2r( *(bot + 8), mm2 );        movq_m2r( *(top + 8), mm3 );        movq_m2r( *(bot + 16), mm4 );        movq_m2r( *(top + 16), mm5 );        movq_m2r( *(bot + 24), mm6 );        movq_m2r( *(top + 24), mm7 );        pand_m2r( shiftmask, mm0 );        pand_m2r( shiftmask, mm1 );        pand_m2r( shiftmask, mm2 );        pand_m2r( shiftmask, mm3 );        pand_m2r( shiftmask, mm4 );        pand_m2r( shiftmask, mm5 );        pand_m2r( shiftmask, mm6 );        pand_m2r( shiftmask, mm7 );        psrlw_i2r( 1, mm0 );        psrlw_i2r( 1, mm1 );        psrlw_i2r( 1, mm2 );        psrlw_i2r( 1, mm3 );        psrlw_i2r( 1, mm4 );        psrlw_i2r( 1, mm5 );        psrlw_i2r( 1, mm6 );        psrlw_i2r( 1, mm7 );        paddb_r2r( mm1, mm0 );        paddb_r2r( mm3, mm2 );        paddb_r2r( mm5, mm4 );        paddb_r2r( mm7, mm6 );        movq_r2m( mm0, *output );        movq_r2m( mm2, *(output + 8) );        movq_r2m( mm4, *(output + 16) );        movq_r2m( mm6, *(output + 24) );        output += 32;        top += 32;        bot += 32;    }    width = (width & 0xf);    for( i = width/4; i; --i ) {        movq_m2r( *bot, mm0 );        movq_m2r( *top, mm1 );        pand_m2r( shiftmask, mm0 );        pand_m2r( shiftmask, mm1 );        psrlw_i2r( 1, mm0 );        psrlw_i2r( 1, mm1 );        paddb_r2r( mm1, mm0 );        movq_r2m( mm0, *output );        output += 8;        top += 8;        bot += 8;    }    width = width & 0x7;    /* Handle last few pixels. */    for( i = width * 2; i; --i ) {        *output++ = ((*top++) + (*bot++)) >> 1;    }    emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,                                                   uint8_t *bot, int width ){    int i;    for( i = width/16; i; --i ) {        movq_m2r( *bot, mm0 );        movq_m2r( *top, mm1 );        movq_m2r( *(bot + 8), mm2 );        movq_m2r( *(top + 8), mm3 );        movq_m2r( *(bot + 16), mm4 );        movq_m2r( *(top + 16), mm5 );        movq_m2r( *(bot + 24), mm6 );        movq_m2r( *(top + 24), mm7 );        pavgb_r2r( mm1, mm0 );        pavgb_r2r( mm3, mm2 );        pavgb_r2r( mm5, mm4 );        pavgb_r2r( mm7, mm6 );        movntq_r2m( mm0, *output );        movntq_r2m( mm2, *(output + 8) );        movntq_r2m( mm4, *(output + 16) );        movntq_r2m( mm6, *(output + 24) );        output += 32;        top += 32;        bot += 32;    }    width = (width & 0xf);    for( i = width/4; i; --i ) {        movq_m2r( *bot, mm0 );        movq_m2r( *top, mm1 );        pavgb_r2r( mm1, mm0 );        movntq_r2m( mm0, *output );        output += 8;        top += 8;        bot += 8;    }    width = width & 0x7;    /* Handle last few pixels. */    for( i = width * 2; i; --i ) {        *output++ = ((*top++) + (*bot++)) >> 1;    }    sfence();    emms();}#endifstatic void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr ){    uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;    uint32_t *o = (uint32_t *) output;    for( width /= 2; width; --width ) {        *o++ = colour;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr ){    uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;    int i;    movd_m2r( colour, mm1 );    movd_m2r( colour, mm2 );    psllq_i2r( 32, mm1 );    por_r2r( mm1, mm2 );    for( i = width / 16; i; --i ) {        movq_r2m( mm2, *output );        movq_r2m( mm2, *(output + 8) );        movq_r2m( mm2, *(output + 16) );        movq_r2m( mm2, *(output + 24) );        output += 32;    }    width = (width & 0xf);    for( i = width / 4; i; --i ) {        movq_r2m( mm2, *output );        output += 8;    }    width = (width & 0x7);    for( i = width / 2; i; --i ) {        *((uint32_t *) output) = colour;        output += 4;    }    if( width & 1 ) {        *output = y;        *(output + 1) = cb;    }    emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr ){    uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;    int i;    movd_m2r( colour, mm1 );    movd_m2r( colour, mm2 );    psllq_i2r( 32, mm1 );    por_r2r( mm1, mm2 );    for( i = width / 16; i; --i ) {        movntq_r2m( mm2, *output );        movntq_r2m( mm2, *(output + 8) );        movntq_r2m( mm2, *(output + 16) );        movntq_r2m( mm2, *(output + 24) );        output += 32;    }    width = (width & 0xf);    for( i = width / 4; i; --i ) {        movntq_r2m( mm2, *output );        output += 8;    }    width = (width & 0x7);    for( i = width / 2; i; --i ) {        *((uint32_t *) output) = colour;        output += 4;    }    if( width & 1 ) {        *output = y;        *(output + 1) = cb;    }    sfence();    emms();}#endifstatic void blit_colour_packed4444_scanline_c( uint8_t *output, int width,                                               int alpha, int luma, int cb, int cr ){    int j;    for( j = 0; j < width; j++ ) {        *output++ = alpha;        *output++ = luma;        *output++ = cb;        *output++ = cr;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width,                                                 int alpha, int luma,                                                 int cb, int cr ){    uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;    int i;    movd_m2r( colour, mm1 );    movd_m2r( colour, mm2 );    psllq_i2r( 32, mm1 );    por_r2r( mm1, mm2 );    for( i = width / 8; i; --i ) {        movq_r2m( mm2, *output );        movq_r2m( mm2, *(output + 8) );        movq_r2m( mm2, *(output + 16) );        movq_r2m( mm2, *(output + 24) );        output += 32;    }    width = (width & 0x7);    for( i = width / 2; i; --i ) {        movq_r2m( mm2, *output );        output += 8;    }    width = (width & 0x1);    if( width ) {        *((uint32_t *) output) = colour;        output += 4;    }    emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width,                                                    int alpha, int luma,                                                    int cb, int cr ){    uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;    int i;    movd_m2r( colour, mm1 );    movd_m2r( colour, mm2 );    psllq_i2r( 32, mm1 );    por_r2r( mm1, mm2 );    for( i = width / 8; i; --i ) {        movntq_r2m( mm2, *output );        movntq_r2m( mm2, *(output + 8) );        movntq_r2m( mm2, *(output + 16) );        movntq_r2m( mm2, *(output + 24) );        output += 32;    }    width = (width & 0x7);    for( i = width / 2; i; --i ) {        movntq_r2m( mm2, *output );        output += 8;    }    width = (width & 0x1);    if( width ) {        *((uint32_t *) output) = colour;        output += 4;    }    sfence();    emms();}#endif#define speedy_memcpy_c xine_fast_memcpy#define speedy_memcpy_mmx xine_fast_memcpy#define speedy_memcpy_mmxext xine_fast_memcpystatic void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width ){    speedy_memcpy_c( dest, src, width*2 );}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width ){    speedy_memcpy_mmx( dest, src, width*2 );}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width ){    speedy_memcpy_mmxext( dest, src, width*2 );}#endifstatic void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input,                                                                uint8_t *foreground, int width, int alpha ){    int i;    for( i = 0; i < width; i++ ) {        int af = foreground[ 0 ];        if( af ) {            int a = ((af * alpha) + 0x80) >> 8;            if( a == 0xff ) {                output[ 0 ] = foreground[ 1 ];                if( ( i & 1 ) == 0 ) {                    output[ 1 ] = foreground[ 2 ];                    output[ 3 ] = foreground[ 3 ];                }            } else if( a ) {                /**                 * (1 - alpha)*B + alpha*F                 * (1 - af*a)*B + af*a*F                 *  B - af*a*B + af*a*F                 *  B + a*(af*F - af*B)                 */                output[ 0 ] = input[ 0 ]                            + ((alpha*( foreground[ 1 ]                                        - multiply_alpha( foreground[ 0 ], input[ 0 ] ) ) + 0x80) >> 8);                if( ( i & 1 ) == 0 ) {                    /**                     * At first I thought I was doing this incorrectly, but                     * the following math has convinced me otherwise.                     *                     * C_r = (1 - alpha)*B + alpha*F                     * C_r = B - af*a*B + af*a*F                     *                     * C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128))                     * C_r = 128 + (B - af*a*B - 128 + af*a*128 + a*af*F - a*af*128)                     * C_r = B - af*a*B + a*af*F                     */                    output[ 1 ] = input[ 1 ] + ((alpha*( foreground[ 2 ]                                            - multiply_alpha( foreground[ 0 ], input[ 1 ] ) ) + 0x80) >> 8);                    output[ 3 ] = input[ 3 ] + ((alpha*( foreground[ 3 ]                                            - multiply_alpha( foreground[ 0 ], input[ 3 ] ) ) + 0x80) >> 8);                }            }        }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?