speedy.c

来自「linux下的MPEG1」· C语言 代码 · 共 2,101 行 · 第 1/5 页

C
2,101
字号
        foreground += 4;        output += 2;        input += 2;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output,                                                                     uint8_t *input,                                                                     uint8_t *foreground,                                                                     int width, int alpha ){    const mmx_t alpha2 = { 0x0000FFFF00000000ULL };    const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };    const mmx_t round  = { 0x0080008000800080ULL };    int i;    if( !alpha ) {        blit_packed422_scanline( output, input, width );        return;    }    if( alpha == 256 ) {        composite_packed4444_to_packed422_scanline( output, input, foreground, width );        return;    }    READ_PREFETCH_2048( input );    READ_PREFETCH_2048( foreground );    movq_m2r( alpha, mm2 );    pshufw_r2r( mm2, mm2, 0 );    pxor_r2r( mm7, mm7 );    for( i = width/2; i; i-- ) {        int fg1 = *((uint32_t *) foreground);        int fg2 = *(((uint32_t *) foreground)+1);        if( fg1 || fg2 ) {            /* mm1 = [ cr ][ y ][ cb ][ y ] */            movd_m2r( *input, mm1 );            punpcklbw_r2r( mm7, mm1 );            movq_m2r( *foreground, mm3 );            movq_r2r( mm3, mm4 );            punpcklbw_r2r( mm7, mm3 );            punpckhbw_r2r( mm7, mm4 );            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */            pshufw_r2r( mm3, mm5, 0 );            pshufw_r2r( mm4, mm6, 0 );            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001000 == 201 */            pshufw_r2r( mm3, mm3, 201 );            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ]  == 00010000 == 16 */            pshufw_r2r( mm4, mm4, 16 );            pand_m2r( alpha1, mm3 );            pand_m2r( alpha2, mm4 );            pand_m2r( alpha1, mm5 );            pand_m2r( alpha2, mm6 );            por_r2r( mm4, mm3 );            por_r2r( mm6, mm5 );            /* now, mm5 is af and mm1 is B.  Need to multiply them. */            pmullw_r2r( mm1, mm5 );            /* Multiply by appalpha. */            pmullw_r2r( mm2, mm3 );            paddw_m2r( round, mm3 );            psrlw_i2r( 8, mm3 );            /* Result is now B + F. */            paddw_r2r( mm3, mm1 );            /* Round up appropriately. */            paddw_m2r( round, mm5 );            /* mm6 contains our i>>8; */            movq_r2r( mm5, mm6 );            psrlw_i2r( 8, mm6 );            /* Add mm6 back into mm5.  Now our result is in the high bytes. */            paddw_r2r( mm6, mm5 );            /* Shift down. */            psrlw_i2r( 8, mm5 );            /* Multiply by appalpha. */            pmullw_r2r( mm2, mm5 );            paddw_m2r( round, mm5 );            psrlw_i2r( 8, mm5 );            psubusw_r2r( mm5, mm1 );            /* mm1 = [ B + F - af*B ] */            packuswb_r2r( mm1, mm1 );            movd_r2m( mm1, *output );        }        foreground += 8;        output += 4;        input += 4;    }    sfence();    emms();}#endifstatic void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input,                                                          uint8_t *foreground, int width ){    int i;    for( i = 0; i < width; i++ ) {        int a = foreground[ 0 ];        if( a == 0xff ) {            output[ 0 ] = foreground[ 1 ];            if( ( i & 1 ) == 0 ) {                output[ 1 ] = foreground[ 2 ];                output[ 3 ] = foreground[ 3 ];            }        } else if( a ) {            /**             * (1 - alpha)*B + alpha*F             *  B + af*F - af*B             */            output[ 0 ] = input[ 0 ] + foreground[ 1 ] - multiply_alpha( foreground[ 0 ], input[ 0 ] );            if( ( i & 1 ) == 0 ) {                /**                 * C_r = (1 - af)*B + af*F                 * C_r = B - af*B + af*F                 */                output[ 1 ] = input[ 1 ] + foreground[ 2 ] - multiply_alpha( foreground[ 0 ], input[ 1 ] );                output[ 3 ] = input[ 3 ] + foreground[ 3 ] - multiply_alpha( foreground[ 0 ], input[ 3 ] );            }        }        foreground += 4;        output += 2;        input += 2;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input,                                                               uint8_t *foreground, int width ){    const mmx_t alpha2 = { 0x0000FFFF00000000ULL };    const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };    const mmx_t round  = { 0x0080008000800080ULL };    int i;    READ_PREFETCH_2048( input );    READ_PREFETCH_2048( foreground );    pxor_r2r( mm7, mm7 );    for( i = width/2; i; i-- ) {        int fg1 = *((uint32_t *) foreground);        int fg2 = *(((uint32_t *) foreground)+1);        if( (fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff ) {            movq_m2r( *foreground, mm3 );            movq_r2r( mm3, mm4 );            punpcklbw_r2r( mm7, mm3 );            punpckhbw_r2r( mm7, mm4 );            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001000 == 201 */            pshufw_r2r( mm3, mm3, 201 );            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0  a ][ 1 y ][ 0  a ][ 0 a ]  == 00010000 == 16 */            pshufw_r2r( mm4, mm4, 16 );            pand_m2r( alpha1, mm3 );            pand_m2r( alpha2, mm4 );            por_r2r( mm4, mm3 );            /* mm1 = [ B + F - af*B ] */            packuswb_r2r( mm3, mm3 );            movd_r2m( mm3, *output );        } else if( fg1 || fg2 ) {            /* mm1 = [ cr ][ y ][ cb ][ y ] */            movd_m2r( *input, mm1 );            punpcklbw_r2r( mm7, mm1 );            movq_m2r( *foreground, mm3 );            movq_r2r( mm3, mm4 );            punpcklbw_r2r( mm7, mm3 );            punpckhbw_r2r( mm7, mm4 );            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */            pshufw_r2r( mm3, mm5, 0 );            pshufw_r2r( mm4, mm6, 0 );            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001000 == 201 */            pshufw_r2r( mm3, mm3, 201 );            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0  a ][ 1 y ][ 0  a ][ 0 a ]  == 00010000 == 16 */            pshufw_r2r( mm4, mm4, 16 );            pand_m2r( alpha1, mm3 );            pand_m2r( alpha2, mm4 );            pand_m2r( alpha1, mm5 );            pand_m2r( alpha2, mm6 );            por_r2r( mm4, mm3 );            por_r2r( mm6, mm5 );            /* now, mm5 is af and mm1 is B.  Need to multiply them. */            pmullw_r2r( mm1, mm5 );            /* Result is now B + F. */            paddw_r2r( mm3, mm1 );            /* Round up appropriately. */            paddw_m2r( round, mm5 );            /* mm6 contains our i>>8; */            movq_r2r( mm5, mm6 );            psrlw_i2r( 8, mm6 );            /* Add mm6 back into mm5.  Now our result is in the high bytes. */            paddw_r2r( mm6, mm5 );            /* Shift down. */            psrlw_i2r( 8, mm5 );            psubusw_r2r( mm5, mm1 );            /* mm1 = [ B + F - af*B ] */            packuswb_r2r( mm1, mm1 );            movd_r2m( mm1, *output );        }        foreground += 8;        output += 4;        input += 4;    }    sfence();    emms();}#endif/** * um... just need some scrap paper... *   D = (1 - alpha)*B + alpha*F *   D = (1 - a)*B + a*textluma *     = B - a*B + a*textluma *     = B + a*(textluma - B) *   Da = (1 - a)*b + a */static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output,                                                          uint8_t *input,                                                          uint8_t *mask,                                                          int width,                                                          int textluma, int textcb,                                                          int textcr ){    uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;    int i;    for( i = 0; i < width; i++ ) {        int a = *mask;        if( a == 0xff ) {            *((uint32_t *) output) = opaque;        } else if( (input[ 0 ] == 0x00) ) {            *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24)                                       | (multiply_alpha( a, textcb ) << 16)                                       | (multiply_alpha( a, textluma ) << 8) | a;        } else if( a ) {            *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)                                       | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)                                       | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)                                       |  (input[ 0 ] + multiply_alpha( a, 0xff - input[ 0 ] ));        }        mask++;        output += 4;        input += 4;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,                                                               uint8_t *input,                                                               uint8_t *mask,                                                               int width,                                                               int textluma, int textcb,                                                               int textcr ){    uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;    const mmx_t round = { 0x0080008000800080ULL };    const mmx_t fullalpha = { 0x00000000000000ffULL };    mmx_t colour;    colour.w[ 0 ] = 0x00;    colour.w[ 1 ] = textluma;    colour.w[ 2 ] = textcb;    colour.w[ 3 ] = textcr;    movq_m2r( colour, mm1 );    movq_r2r( mm1, mm0 );    /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */    paddw_m2r( fullalpha, mm0 );    /* mm7 = 0 */    pxor_r2r( mm7, mm7 );    /* mm6 = round */    movq_m2r( round, mm6 );    while( width-- ) {        int a = *mask;        if( a == 0xff ) {            *((uint32_t *) output) = opaque;        } else if( (input[ 0 ] == 0x00) ) {            /* We just need to multiply our colour by the alpha value. */            /* mm2 = [ a ][ a ][ a ][ a ] */            movd_m2r( a, mm2 );            movq_r2r( mm2, mm3 );            pshufw_r2r( mm2, mm2, 0 );            /* mm5 = [ cr ][ cb ][ y ][ 0 ] */            movq_r2r( mm1, mm5 );            /* Multiply by alpha. */            pmullw_r2r( mm2, mm5 );            paddw_m2r( round, mm5 );            movq_r2r( mm5, mm6 );            psrlw_i2r( 8, mm6 );            paddw_r2r( mm6, mm5 );            psrlw_i2r( 8, mm5 );            /* Set alpha to a. */            por_r2r( mm3, mm5 );            /* Pack and write our result. */            packuswb_r2r( mm5, mm5 );            movd_r2m( mm5, *output );        } else if( a ) {            /* mm2 = [ a ][ a ][ a ][ a ] */            movd_m2r( a, mm2 );            pshufw_r2r( mm2, mm2, 0 );            /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */            movq_r2r( mm0, mm3 );            /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */            movd_m2r( *input, mm4 );            punpcklbw_r2r( mm7, mm4 );            /* Subtract input and colour. */            psubw_r2r( mm4, mm3 );  /* mm3 = mm3 - mm4 */            /* Multiply alpha. */            pmullw_r2r( mm2, mm3 );            paddw_r2r( mm6, mm3 );            movq_r2r( mm3, mm2 );            psrlw_i2r( 8, mm3 );            paddw_r2r( mm2, mm3 );            psrlw_i2r( 8, mm3 );            /* Add back in the input. */            paddb_r2r( mm3, mm4 );            /* Write result. */            packuswb_r2r( mm4, mm4 );            movd_r2m( mm4, *output );        }        mask++;        output += 4;        input += 4;    }    sfence();    emms();}#endifstatic void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,                                                                uint8_t *input,                                                                uint8_t *mask, int width,                                                                int textluma, int textcb,                                                                int textcr, int alpha ){    uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;    int i;    for( i = 0; i < width; i++ ) {        int af = *mask;        if( af ) {           int a = ((af * alpha) + 0x80) >> 8;           if( a == 0xff ) {               *((uint32_t *) output) = opaque;           } else if( input[ 0 ] == 0x00 ) {               *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24)                                          | (multiply_alpha( a, textcb ) << 16)                                          | (multiply_alpha( a, textluma ) << 8) | a;           } else if( a ) {               *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)                                         | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)                                         | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)                                         | (a + multiply_alpha( 0xff - a, input[ 0 ] ));           }        }        mask++;        output += 4;        input += 4;    }}static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width ){    while( width-- ) {        unsigned int cur_a = input[ 0 ];        *((uint32_t *) output) = (multiply_alpha( cur_a, input[ 3 ] ) << 24)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?