📄 i420_rgb_mmx.h
punpckhbw %%xmm1, %%xmm6  # R7 00 R6 00 R5 00 R4 00                     \n\
movdqa    %%xmm6, %%xmm0  # R7 00 R6 00 R5 00 R4 00                     \n\
punpcklwd %%xmm2, %%xmm6  # B5 G5 R5 00 B4 G4 R4 00                     \n\
movntdq   %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8             \n\
punpckhwd %%xmm2, %%xmm0  # B7 G7 R7 00 B6 G6 R6 00                     \n\
movntdq   %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12           \n\
"

#define SSE2_UNPACK_32_BGRA_UNALIGNED "                                 \n\
pxor      %%xmm3, %%xmm3  # zero mm3                                    \n\
movdqa    %%xmm2, %%xmm4  # G7 G6 G5 G4 G3 G2 G1 G0                     \n\
punpcklbw %%xmm0, %%xmm4  # B3 G3 B2 G2 B1 G1 B0 G0                     \n\
punpcklbw %%xmm1, %%xmm3  # R3 00 R2 00 R1 00 R0 00                     \n\
movdqa    %%xmm3, %%xmm5  # R3 00 R2 00 R1 00 R0 00                     \n\
punpcklwd %%xmm4, %%xmm3  # B1 G1 R1 00 B0 G0 R0 00                     \n\
movdqu    %%xmm3, (%3)    # Store BGRA3 BGRA2 BGRA1 BGRA0               \n\
punpckhwd %%xmm4, %%xmm5  # B3 G3 R3 00 B2 G2 R2 00                     \n\
movdqu    %%xmm5, 16(%3)  # Store BGRA7 BGRA6 BGRA5 BGRA4               \n\
pxor      %%xmm6, %%xmm6  # zero mm6                                    \n\
punpckhbw %%xmm0, %%xmm2  # B7 G7 B6 G6 B5 G5 B4 G4                     \n\
punpckhbw %%xmm1, %%xmm6  # R7 00 R6 00 R5 00 R4 00                     \n\
movdqa    %%xmm6, %%xmm0  # R7 00 R6 00 R5 00 R4 00                     \n\
punpcklwd %%xmm2, %%xmm6  # B5 G5 R5 00 B4 G4 R4 00                     \n\
movdqu    %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8             \n\
punpckhwd %%xmm2, %%xmm0  # B7 G7 R7 00 B6 G6 R6 00                     \n\
movdqu    %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12           \n\
"

#define SSE2_UNPACK_32_ABGR_ALIGNED "                                   \n\
pxor      %%xmm3, %%xmm3  # zero mm3                                    \n\
movdqa    %%xmm1, %%xmm4  # R7 R6 R5 R4 R3 R2 R1 R0                     \n\
punpcklbw %%xmm2, %%xmm4  # G3 R3 G2 R2 G1 R1 G0 R0                     \n\
movdqa    %%xmm0, %%xmm5  # B7 B6 B5 B4 B3 B2 B1 B0                     \n\
punpcklbw %%xmm3, %%xmm5  # 00 B3 00 B2 00 B1 00 B0                     \n\
movdqa    %%xmm4, %%xmm6  # G3 R3 G2 R2 G1 R1 G0 R0                     \n\
punpcklwd %%xmm5, %%xmm4  # 00 B1 G1 R1 00 B0 G0 R0                     \n\
movntdq   %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0               \n\
punpckhwd %%xmm5, %%xmm6  # 00 B3 G3 R3 00 B2 G2 R2                     \n\
movntdq   %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4               \n\
punpckhbw %%xmm2, %%xmm1  # G7 R7 G6 R6 G5 R5 G4 R4                     \n\
punpckhbw %%xmm3, %%xmm0  # 00 B7 00 B6 00 B5 00 B4                     \n\
movdqa    %%xmm1, %%xmm2  # G7 R7 G6 R6 G5 R5 G4 R4                     \n\
punpcklwd %%xmm0, %%xmm1  # 00 B5 G5 R5 00 B4 G4 R4                     \n\
movntdq   %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8             \n\
punpckhwd %%xmm0, %%xmm2  # B7 G7 R7 00 B6 G6 R6 00                     \n\
movntdq   %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12           \n\
"

#define SSE2_UNPACK_32_ABGR_UNALIGNED "                                 \n\
pxor      %%xmm3, %%xmm3  # zero mm3                                    \n\
movdqa    %%xmm1, %%xmm4  # R7 R6 R5 R4 R3 R2 R1 R0                     \n\
punpcklbw %%xmm2, %%xmm4  # G3 R3 G2 R2 G1 R1 G0 R0                     \n\
movdqa    %%xmm0, %%xmm5  # B7 B6 B5 B4 B3 B2 B1 B0                     \n\
punpcklbw %%xmm3, %%xmm5  # 00 B3 00 B2 00 B1 00 B0                     \n\
movdqa    %%xmm4, %%xmm6  # G3 R3 G2 R2 G1 R1 G0 R0                     \n\
punpcklwd %%xmm5, %%xmm4  # 00 B1 G1 R1 00 B0 G0 R0                     \n\
movdqu    %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0               \n\
punpckhwd %%xmm5, %%xmm6  # 00 B3 G3 R3 00 B2 G2 R2                     \n\
movdqu    %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4               \n\
punpckhbw %%xmm2, %%xmm1  # G7 R7 G6 R6 G5 R5 G4 R4                     \n\
punpckhbw %%xmm3, %%xmm0  # 00 B7 00 B6 00 B5 00 B4                     \n\
movdqa    %%xmm1, %%xmm2  # R7 00 R6 00 R5 00 R4 00                     \n\
punpcklwd %%xmm0, %%xmm1  # 00 B5 G5 R5 00 B4 G4 R4                     \n\
movdqu    %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8             \n\
punpckhwd %%xmm0, %%xmm2  # B7 G7 R7 00 B6 G6 R6 00                     \n\
movdqu    %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12           \n\
"

#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)        \
    do {                                    \
        __m128i xmm0, xmm1, xmm2, xmm3,     \
                xmm4, xmm5, xmm6, xmm7;     \
        SSE2_INSTRUCTIONS                   \
    } while(0)

#define SSE2_END  _mm_sfence()

#define SSE2_INIT_16_ALIGNED                \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
    xmm4 = _mm_setzero_si128();             \
    xmm6 = _mm_load_si128((__m128i *)p_y);

#define SSE2_INIT_16_UNALIGNED              \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
    xmm4 = _mm_setzero_si128();             \
    xmm6 = _mm_loadu_si128((__m128i *)p_y); \
    _mm_prefetch(p_buffer, _MM_HINT_NTA);

#define SSE2_INIT_32_ALIGNED                \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
    xmm4 = _mm_setzero_si128();             \
    xmm6 = _mm_load_si128((__m128i *)p_y);

#define SSE2_INIT_32_UNALIGNED              \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
    xmm4 = _mm_setzero_si128();             \
    xmm6 = _mm_loadu_si128((__m128i *)p_y); \
    _mm_prefetch(p_buffer, _MM_HINT_NTA);

#define SSE2_YUV_MUL                        \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm4);   \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   \
    xmm5 = _mm_set1_epi32(0x00800080UL);    \
    xmm0 = _mm_subs_epi16(xmm0, xmm5);      \
    xmm1 = _mm_subs_epi16(xmm1, xmm5);      \
    xmm0 = _mm_slli_epi16(xmm0, 3);         \
    xmm1 = _mm_slli_epi16(xmm1, 3);         \
    xmm2 = xmm0;                            \
    xmm3 = xmm1;                            \
    xmm5 = _mm_set1_epi32(0xf37df37dUL);    \
    xmm2 = _mm_mulhi_epi16(xmm2, xmm5);     \
    xmm5 = _mm_set1_epi32(0xe5fce5fcUL);    \
    xmm3 = _mm_mulhi_epi16(xmm3, xmm5);     \
    xmm5 = _mm_set1_epi32(0x40934093UL);    \
    xmm0 = _mm_mulhi_epi16(xmm0, xmm5);     \
    xmm5 = _mm_set1_epi32(0x33123312UL);    \
    xmm1 = _mm_mulhi_epi16(xmm1, xmm5);     \
    xmm2 = _mm_adds_epi16(xmm2, xmm3);      \
                                            \
    xmm5 = _mm_set1_epi32(0x10101010UL);    \
    xmm6 = _mm_subs_epu8(xmm6, xmm5);       \
    xmm7 = xmm6;                            \
    xmm5 = _mm_set1_epi32(0x00ff00ffUL);    \
    xmm6 = _mm_and_si128(xmm6, xmm5);       \
    xmm7 = _mm_srli_epi16(xmm7, 8);         \
    xmm6 = _mm_slli_epi16(xmm6, 3);         \
    xmm7 = _mm_slli_epi16(xmm7, 3);         \
    xmm5 = _mm_set1_epi32(0x253f253fUL);    \
    xmm6 = _mm_mulhi_epi16(xmm6, xmm5);     \
    xmm7 = _mm_mulhi_epi16(xmm7, xmm5);

#define SSE2_YUV_ADD                        \
    xmm3 = xmm0;                            \
    xmm4 = xmm1;                            \
    xmm5 = xmm2;                            \
    xmm0 = _mm_adds_epi16(xmm0, xmm6);      \
    xmm3 = _mm_adds_epi16(xmm3, xmm7);      \
    xmm1 = _mm_adds_epi16(xmm1, xmm6);      \
    xmm4 = _mm_adds_epi16(xmm4, xmm7);      \
    xmm2 = _mm_adds_epi16(xmm2, xmm6);      \
    xmm5 = _mm_adds_epi16(xmm5, xmm7);      \
                                            \
    xmm0 = _mm_packus_epi16(xmm0, xmm0);    \
    xmm1 = _mm_packus_epi16(xmm1, xmm1);    \
    xmm2 = _mm_packus_epi16(xmm2, xmm2);    \
                                            \
    xmm3 = _mm_packus_epi16(xmm3, xmm3);    \
    xmm4 = _mm_packus_epi16(xmm4, xmm4);    \
    xmm5 = _mm_packus_epi16(xmm5, xmm5);    \
                                            \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3);   \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);

#define SSE2_UNPACK_15_ALIGNED                          \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                \
    xmm0 = _mm_and_si128(xmm0, xmm5);                   \
    xmm0 = _mm_srli_epi16(xmm0, 3);                     \
    xmm2 = _mm_and_si128(xmm2, xmm5);                   \
    xmm1 = _mm_and_si128(xmm1, xmm5);                   \
    xmm1 = _mm_srli_epi16(xmm1, 1);                     \
    xmm4 = _mm_setzero_si128();                         \
    xmm5 = xmm0;                                        \
    xmm7 = xmm2;                                        \
                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);               \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm2 = _mm_slli_epi16(xmm2, 2);                     \
    xmm0 = _mm_or_si128(xmm0, xmm2);                    \
    _mm_stream_si128((__m128i*)p_buffer, xmm0);         \
                                                        \
    xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);               \
    xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);               \
    xmm7 = _mm_slli_epi16(xmm7, 2);                     \
    xmm5 = _mm_or_si128(xmm5, xmm7);                    \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);

#define SSE2_UNPACK_15_UNALIGNED                        \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                \
    xmm0 = _mm_and_si128(xmm0, xmm5);                   \
    xmm0 = _mm_srli_epi16(xmm0, 3);                     \
    xmm2 = _mm_and_si128(xmm2, xmm5);                   \
    xmm1 = _mm_and_si128(xmm1, xmm5);                   \
    xmm1 = _mm_srli_epi16(xmm1, 1);                     \
    xmm4 = _mm_setzero_si128();                         \
    xmm5 = xmm0;                                        \
    xmm7 = xmm2;                                        \
                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);               \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm2 = _mm_slli_epi16(xmm2, 2);                     \
    xmm0 = _mm_or_si128(xmm0, xmm2);                    \
    _mm_storeu_si128((__m128i*)p_buffer, xmm0);         \
                                                        \
    xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);               \
    xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);               \
    xmm7 = _mm_slli_epi16(xmm7, 2);                     \
    xmm5 = _mm_or_si128(xmm5, xmm7);                    \
    _mm_storeu_si128((__m128i*)(p_buffer+16), xmm5);

#define SSE2_UNPACK_16_ALIGNED                          \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                \
    xmm0 = _mm_and_si128(xmm0, xmm5);                   \
    xmm1 = _mm_and_si128(xmm1, xmm5);                   \
    xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);                \
    xmm2 = _mm_and_si128(xmm2, xmm5);                   \
    xmm0 = _mm_srli_epi16(xmm0, 3);                     \
    xmm4 = _mm_setzero_si128();                         \
    xmm5 = xmm0;                                        \
    xmm7 = xmm2;                                        \
                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);               \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm2 = _mm_sl
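/*
 * Editor's usage sketch (not part of the original header): the intrinsics
 * macros above are written to be expanded inside SSE2_CALL(), which supplies
 * the xmm0..xmm7 locals they read and write, and they reference the pointer
 * names p_y, p_u, p_v and p_buffer from the enclosing scope.  The inline-asm
 * string variants are presumably spliced into an __asm__ block instead, with
 * operand %3 being the output buffer (the "Store ..." comments write to it).
 * The wrapper function below is hypothetical and only illustrates how one
 * aligned 16-pixel, 15bpp-output step might compose under these assumptions.
 */
#if 0 /* illustration only */
#include <stdint.h>

static void yuv_to_rgb15_16px(uint8_t *p_y, uint8_t *p_u, uint8_t *p_v,
                              uint16_t *p_buffer)
{
    /* Load 16 Y and 8 U/V samples, convert, and store 16 RGB15 pixels. */
    SSE2_CALL(
        SSE2_INIT_16_ALIGNED
        SSE2_YUV_MUL
        SSE2_YUV_ADD
        SSE2_UNPACK_15_ALIGNED
    );
    SSE2_END;  /* sfence: make the non-temporal stores globally visible */
}
#endif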