speedy.c
来自「linux下的MPEG1」· C语言 代码 · 共 2,101 行 · 第 1/5 页
C
2,101 行
foreground += 4; output += 2; input += 2; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input, uint8_t *foreground, int width, int alpha ){ const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; const mmx_t round = { 0x0080008000800080ULL }; int i; if( !alpha ) { blit_packed422_scanline( output, input, width ); return; } if( alpha == 256 ) { composite_packed4444_to_packed422_scanline( output, input, foreground, width ); return; } READ_PREFETCH_2048( input ); READ_PREFETCH_2048( foreground ); movq_m2r( alpha, mm2 ); pshufw_r2r( mm2, mm2, 0 ); pxor_r2r( mm7, mm7 ); for( i = width/2; i; i-- ) { int fg1 = *((uint32_t *) foreground); int fg2 = *(((uint32_t *) foreground)+1); if( fg1 || fg2 ) { /* mm1 = [ cr ][ y ][ cb ][ y ] */ movd_m2r( *input, mm1 ); punpcklbw_r2r( mm7, mm1 ); movq_m2r( *foreground, mm3 ); movq_r2r( mm3, mm4 ); punpcklbw_r2r( mm7, mm3 ); punpckhbw_r2r( mm7, mm4 ); /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ pshufw_r2r( mm3, mm5, 0 ); pshufw_r2r( mm4, mm6, 0 ); /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ pshufw_r2r( mm3, mm3, 201 ); /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ pshufw_r2r( mm4, mm4, 16 ); pand_m2r( alpha1, mm3 ); pand_m2r( alpha2, mm4 ); pand_m2r( alpha1, mm5 ); pand_m2r( alpha2, mm6 ); por_r2r( mm4, mm3 ); por_r2r( mm6, mm5 ); /* now, mm5 is af and mm1 is B. Need to multiply them. */ pmullw_r2r( mm1, mm5 ); /* Multiply by appalpha. */ pmullw_r2r( mm2, mm3 ); paddw_m2r( round, mm3 ); psrlw_i2r( 8, mm3 ); /* Result is now B + F. */ paddw_r2r( mm3, mm1 ); /* Round up appropriately. */ paddw_m2r( round, mm5 ); /* mm6 contains our i>>8; */ movq_r2r( mm5, mm6 ); psrlw_i2r( 8, mm6 ); /* Add mm6 back into mm5. Now our result is in the high bytes. */ paddw_r2r( mm6, mm5 ); /* Shift down. */ psrlw_i2r( 8, mm5 ); /* Multiply by appalpha. */ pmullw_r2r( mm2, mm5 ); paddw_m2r( round, mm5 ); psrlw_i2r( 8, mm5 ); psubusw_r2r( mm5, mm1 ); /* mm1 = [ B + F - af*B ] */ packuswb_r2r( mm1, mm1 ); movd_r2m( mm1, *output ); } foreground += 8; output += 4; input += 4; } sfence(); emms();}#endifstatic void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, uint8_t *foreground, int width ){ int i; for( i = 0; i < width; i++ ) { int a = foreground[ 0 ]; if( a == 0xff ) { output[ 0 ] = foreground[ 1 ]; if( ( i & 1 ) == 0 ) { output[ 1 ] = foreground[ 2 ]; output[ 3 ] = foreground[ 3 ]; } } else if( a ) { /** * (1 - alpha)*B + alpha*F * B + af*F - af*B */ output[ 0 ] = input[ 0 ] + foreground[ 1 ] - multiply_alpha( foreground[ 0 ], input[ 0 ] ); if( ( i & 1 ) == 0 ) { /** * C_r = (1 - af)*B + af*F * C_r = B - af*B + af*F */ output[ 1 ] = input[ 1 ] + foreground[ 2 ] - multiply_alpha( foreground[ 0 ], input[ 1 ] ); output[ 3 ] = input[ 3 ] + foreground[ 3 ] - multiply_alpha( foreground[ 0 ], input[ 3 ] ); } } foreground += 4; output += 2; input += 2; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input, uint8_t *foreground, int width ){ const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; const mmx_t round = { 0x0080008000800080ULL }; int i; READ_PREFETCH_2048( input ); READ_PREFETCH_2048( foreground ); pxor_r2r( mm7, mm7 ); for( i = width/2; i; i-- ) { int fg1 = *((uint32_t *) foreground); int fg2 = *(((uint32_t *) foreground)+1); if( (fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff ) { movq_m2r( *foreground, mm3 ); movq_r2r( mm3, mm4 ); punpcklbw_r2r( mm7, mm3 ); punpckhbw_r2r( mm7, mm4 ); /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ pshufw_r2r( mm3, mm3, 201 ); /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ pshufw_r2r( mm4, mm4, 16 ); pand_m2r( alpha1, mm3 ); pand_m2r( alpha2, mm4 ); por_r2r( mm4, mm3 ); /* mm1 = [ B + F - af*B ] */ packuswb_r2r( mm3, mm3 ); movd_r2m( mm3, *output ); } else if( fg1 || fg2 ) { /* mm1 = [ cr ][ y ][ cb ][ y ] */ movd_m2r( *input, mm1 ); punpcklbw_r2r( mm7, mm1 ); movq_m2r( *foreground, mm3 ); movq_r2r( mm3, mm4 ); punpcklbw_r2r( mm7, mm3 ); punpckhbw_r2r( mm7, mm4 ); /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ pshufw_r2r( mm3, mm5, 0 ); pshufw_r2r( mm4, mm6, 0 ); /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ pshufw_r2r( mm3, mm3, 201 ); /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ pshufw_r2r( mm4, mm4, 16 ); pand_m2r( alpha1, mm3 ); pand_m2r( alpha2, mm4 ); pand_m2r( alpha1, mm5 ); pand_m2r( alpha2, mm6 ); por_r2r( mm4, mm3 ); por_r2r( mm6, mm5 ); /* now, mm5 is af and mm1 is B. Need to multiply them. */ pmullw_r2r( mm1, mm5 ); /* Result is now B + F. */ paddw_r2r( mm3, mm1 ); /* Round up appropriately. */ paddw_m2r( round, mm5 ); /* mm6 contains our i>>8; */ movq_r2r( mm5, mm6 ); psrlw_i2r( 8, mm6 ); /* Add mm6 back into mm5. Now our result is in the high bytes. */ paddw_r2r( mm6, mm5 ); /* Shift down. */ psrlw_i2r( 8, mm5 ); psubusw_r2r( mm5, mm1 ); /* mm1 = [ B + F - af*B ] */ packuswb_r2r( mm1, mm1 ); movd_r2m( mm1, *output ); } foreground += 8; output += 4; input += 4; } sfence(); emms();}#endif/** * um... just need some scrap paper... * D = (1 - alpha)*B + alpha*F * D = (1 - a)*B + a*textluma * = B - a*B + a*textluma * = B + a*(textluma - B) * Da = (1 - a)*b + a */static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output, uint8_t *input, uint8_t *mask, int width, int textluma, int textcb, int textcr ){ uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; int i; for( i = 0; i < width; i++ ) { int a = *mask; if( a == 0xff ) { *((uint32_t *) output) = opaque; } else if( (input[ 0 ] == 0x00) ) { *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24) | (multiply_alpha( a, textcb ) << 16) | (multiply_alpha( a, textluma ) << 8) | a; } else if( a ) { *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24) | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16) | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8) | (input[ 0 ] + multiply_alpha( a, 0xff - input[ 0 ] )); } mask++; output += 4; input += 4; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, uint8_t *mask, int width, int textluma, int textcb, int textcr ){ uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; const mmx_t round = { 0x0080008000800080ULL }; const mmx_t fullalpha = { 0x00000000000000ffULL }; mmx_t colour; colour.w[ 0 ] = 0x00; colour.w[ 1 ] = textluma; colour.w[ 2 ] = textcb; colour.w[ 3 ] = textcr; movq_m2r( colour, mm1 ); movq_r2r( mm1, mm0 ); /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */ paddw_m2r( fullalpha, mm0 ); /* mm7 = 0 */ pxor_r2r( mm7, mm7 ); /* mm6 = round */ movq_m2r( round, mm6 ); while( width-- ) { int a = *mask; if( a == 0xff ) { *((uint32_t *) output) = opaque; } else if( (input[ 0 ] == 0x00) ) { /* We just need to multiply our colour by the alpha value. */ /* mm2 = [ a ][ a ][ a ][ a ] */ movd_m2r( a, mm2 ); movq_r2r( mm2, mm3 ); pshufw_r2r( mm2, mm2, 0 ); /* mm5 = [ cr ][ cb ][ y ][ 0 ] */ movq_r2r( mm1, mm5 ); /* Multiply by alpha. */ pmullw_r2r( mm2, mm5 ); paddw_m2r( round, mm5 ); movq_r2r( mm5, mm6 ); psrlw_i2r( 8, mm6 ); paddw_r2r( mm6, mm5 ); psrlw_i2r( 8, mm5 ); /* Set alpha to a. */ por_r2r( mm3, mm5 ); /* Pack and write our result. */ packuswb_r2r( mm5, mm5 ); movd_r2m( mm5, *output ); } else if( a ) { /* mm2 = [ a ][ a ][ a ][ a ] */ movd_m2r( a, mm2 ); pshufw_r2r( mm2, mm2, 0 ); /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */ movq_r2r( mm0, mm3 ); /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */ movd_m2r( *input, mm4 ); punpcklbw_r2r( mm7, mm4 ); /* Subtract input and colour. */ psubw_r2r( mm4, mm3 ); /* mm3 = mm3 - mm4 */ /* Multiply alpha. */ pmullw_r2r( mm2, mm3 ); paddw_r2r( mm6, mm3 ); movq_r2r( mm3, mm2 ); psrlw_i2r( 8, mm3 ); paddw_r2r( mm2, mm3 ); psrlw_i2r( 8, mm3 ); /* Add back in the input. */ paddb_r2r( mm3, mm4 ); /* Write result. */ packuswb_r2r( mm4, mm4 ); movd_r2m( mm4, *output ); } mask++; output += 4; input += 4; } sfence(); emms();}#endifstatic void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output, uint8_t *input, uint8_t *mask, int width, int textluma, int textcb, int textcr, int alpha ){ uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; int i; for( i = 0; i < width; i++ ) { int af = *mask; if( af ) { int a = ((af * alpha) + 0x80) >> 8; if( a == 0xff ) { *((uint32_t *) output) = opaque; } else if( input[ 0 ] == 0x00 ) { *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24) | (multiply_alpha( a, textcb ) << 16) | (multiply_alpha( a, textluma ) << 8) | a; } else if( a ) { *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24) | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16) | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8) | (a + multiply_alpha( 0xff - a, input[ 0 ] )); } } mask++; output += 4; input += 4; }}static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width ){ while( width-- ) { unsigned int cur_a = input[ 0 ]; *((uint32_t *) output) = (multiply_alpha( cur_a, input[ 3 ] ) << 24)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?