📄 dsp_mmx.c
字号:
for ( i = 0; i < 4; i++ ){ SadValue2[0] += abs(Src1[0] - Src2[0]); SadValue2[1] += abs(Src1[1] - Src2[1]); SadValue2[2] += abs(Src1[2] - Src2[2]); SadValue2[3] += abs(Src1[3] - Src2[3]); SadValue2[4] += abs(Src1[4] - Src2[4]); SadValue2[5] += abs(Src1[5] - Src2[5]); SadValue2[6] += abs(Src1[6] - Src2[6]); SadValue2[7] += abs(Src1[7] - Src2[7]); Src1 += stride; Src2 += stride; } for ( i = 0; i < 8; i++ ){ if ( SadValue[i] > MaxSad ) MaxSad = SadValue[i]; if ( SadValue2[i] > MaxSad ) MaxSad = SadValue2[i]; } return MaxSad;
#else
ogg_uint32_t MaxSad;
__asm {
align 16
mov ebx, Src1
mov ecx, Src2
pxor mm3, mm3 ; /* zero out mm3 for unpack */
pxor mm4, mm4 ; /* mm4 low sum */
pxor mm5, mm5 ; /* mm5 high sum */
pxor mm6, mm6 ; /* mm6 low sum */
pxor mm7, mm7 ; /* mm7 high sum */
mov edi, 4 ; /* 4 rows */
label_1: ;
movq mm0, [ebx] ; /* take 8 bytes */
movq mm1, [ecx] ; /* take 8 bytes */
movq mm2, mm0 ;
psubusb mm0, mm1 ; /* A - B */
psubusb mm1, mm2 ; /* B - A */
por mm0, mm1 ; /* and or gives abs difference */
movq mm1, mm0 ;
punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
paddw mm4, mm0 ; /* accumulate difference... */
punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
paddw mm5, mm1 ; /* accumulate difference... */
add ebx, stride ; /* Inc pointer into the new data */
add ecx, stride ; /* Inc pointer into the new data */
dec edi ;
jnz label_1 ;
mov edi, 4 ; /* 4 rows */
label_2: ;
movq mm0, [ebx] ; /* take 8 bytes */
movq mm1, [ecx] ; /* take 8 bytes */
movq mm2, mm0 ;
psubusb mm0, mm1 ; /* A - B */
psubusb mm1, mm2 ; /* B - A */
por mm0, mm1 ; /* and or gives abs difference */
movq mm1, mm0 ;
punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
paddw mm6, mm0 ; /* accumulate difference... */
punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
paddw mm7, mm1 ; /* accumulate difference... */
add ebx, stride ; /* Inc pointer into the new data */
add ecx, stride ; /* Inc pointer into the new data */
dec edi ;
jnz label_2 ;
psubusw mm7, mm6 ;
paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
psubusw mm5, mm4 ;
paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
psubusw mm7, mm5 ;
paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
movq mm6, mm7 ;
psrlq mm6, 32 ;
psubusw mm7, mm6 ;
paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
movq mm6, mm7 ;
psrlq mm6, 16 ;
psubusw mm7, mm6 ;
paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
movd eax, mm7 ;
and eax, 0xffff ;
mov MaxSad, eax
};
return MaxSad;
#endif
}
/*
 * sad8x8__mmx: sum of absolute differences (SAD) over an 8x8 block of bytes.
 *
 * ptr1, stride1 - first block, and the value added to the ptr1 cursor to reach
 *                 its next row.
 * ptr2, stride2 - second block, and the value added to the ptr2 cursor to reach
 *                 its next row.
 *
 * Returns the sum of |a - b| over 8 rows of 8 bytes each. The sum is built in
 * four 16-bit lanes of mm7; each lane receives 16 byte differences (at most
 * 16 * 255 = 4080) and the final total is at most 64 * 255 = 16320, so the
 * 16-bit arithmetic cannot wrap.
 *
 * The "#if 0" branch is a disabled C version of the same computation. The
 * active branch is the MMX version, written as eight unrolled row steps.
 *
 * No emms instruction is issued in this block.
 * NOTE(review): presumably the caller clears MMX state before any x87 use -
 * confirm against the call sites.
 */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2)
{
#if 0
ogg_uint32_t i; ogg_uint32_t sad = 0; for (i=8; i; i--) { sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); /* Step to next row of block. */ ptr1 += stride1; ptr2 += stride2; } return sad;
#else
ogg_uint32_t DiffVal;
__asm {
/* Register roles: ebx = ptr1 row cursor, edx = ptr2 row cursor, */
/* mm6 = constant zero, mm7 = four 16-bit partial sums.          */
align 16
mov ebx, ptr1
mov edx, ptr2
pxor mm6, mm6 ; /* mm6 = 0, used as the zero half when widening bytes to words */
pxor mm7, mm7 ; /* mm7 = 0, the four 16-bit accumulator lanes */
; /* ITERATION 1 (row 0) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A because mm0 is overwritten next */
psubusb mm0, mm1 ; /* saturating A - B: zero in every byte where B >= A */
psubusb mm1, mm2 ; /* saturating B - A: zero in every byte where A >= B */
por mm0, mm1 ; /* one side is zero per byte, so OR yields |A - B| */
movq mm1, mm0 ; /* second copy so both halves can be widened */
punpcklbw mm0, mm6 ; /* low four bytes -> four 16-bit words */
paddw mm7, mm0 ; /* add low-half differences into the accumulator */
punpckhbw mm1, mm6 ; /* high four bytes -> four 16-bit words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* add high-half differences into the accumulator */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 2 (row 1) - same steps as iteration 1 */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 3 (row 2) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 4 (row 3) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 5 (row 4) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 6 (row 5) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 7 (row 6) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* advance the ptr1 cursor to the next row */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* advance the ptr2 cursor to the next row */
; /* ITERATION 8 (row 7, last row) */
movq mm0, [ebx] ; /* mm0 = 8 bytes of the current ptr1 row (A) */
movq mm1, [edx] ; /* mm1 = 8 bytes of the current ptr2 row (B) */
movq mm2, mm0 ; /* keep a copy of A */
psubusb mm0, mm1 ; /* saturating A - B */
psubusb mm1, mm2 ; /* saturating B - A */
por mm0, mm1 ; /* |A - B| per byte */
movq mm1, mm0 ; /* second copy for the high half */
punpcklbw mm0, mm6 ; /* low four bytes -> words */
paddw mm7, mm0 ; /* accumulate low half */
punpckhbw mm1, mm6 ; /* high four bytes -> words */
add ebx, stride1 ; /* cursor still advanced, but ebx is not read again below */
paddw mm7, mm1 ; /* accumulate high half */
add edx, stride2 ; /* cursor still advanced, but edx is not read again below */
; /* Horizontal reduction: fold the four word lanes [w3 w2 w1 w0] of mm7 into word 0 */
movq mm0, mm7 ; /* mm0 = [w3 w2 w1 w0] */
psrlq mm7, 32 ; /* mm7 = [0 0 w3 w2] */
paddw mm7, mm0 ; /* word 0 = w0+w2, word 1 = w1+w3 */
movq mm0, mm7 ; /* copy the pairwise sums */
psrlq mm7, 16 ; /* move w1+w3 down into word 0 */
paddw mm7, mm0 ; /* word 0 = w0+w1+w2+w3; the upper words hold leftovers */
movd eax, mm7 ; /* eax = low 32 bits of mm7 */
and eax, 0xffff ; /* keep word 0 only, discarding the leftover in bits 16-31 */
mov DiffVal, eax
};
return DiffVal;
#endif
}
/*
 * sad8x8_thres__mmx: thresholded 8x8 SAD entry point.
 *
 * ptr1, stride1 - first block and its row step.
 * ptr2, stride2 - second block and its row step.
 * thres         - early-exit threshold; only the disabled C version reads it.
 *
 * The disabled "#if 0" version stops adding rows as soon as the running sum
 * exceeds thres and returns that partial sum. The active branch ignores thres
 * and returns whatever sad8x8__mmx produces for the same four arguments.
 */
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres)
{
#if 0
  ogg_uint32_t row;
  ogg_uint32_t col;
  ogg_uint32_t total = 0;

  for (row = 0; row < 8; row++) {
    /* Add the eight absolute differences of this row. */
    for (col = 0; col < 8; col++)
      total += DSP_OP_ABS_DIFF(ptr1[col], ptr2[col]);

    /* Stop early once the running total has passed the threshold. */
    if (total > thres)
      break;

    /* Step both cursors to the next row of the block. */
    ptr1 += stride1;
    ptr2 += stride2;
  }

  return total;
#else
  /* thres is not consulted on this path. */
  ogg_uint32_t total = sad8x8__mmx (ptr1, stride1, ptr2, stride2);

  return total;
#endif
}
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
ogg_uint32_t thres)
{
#if 0
ogg_uint32_t i; ogg_uint32_t sad = 0; for (i=8; i; i--) { sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); if ( sad > thres ) break; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr1 += RefStride; RefDataPtr2 += RefStride; } return sad;
#else
ogg_uint32_t DiffVal;
__asm {
align 16
mov ebx, SrcData
mov ecx, RefDataPtr1
mov edx, RefDataPtr2
pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
paddb mm5, mm5 ;
;
pxor mm6, mm6 ; /* zero out mm6 for unpack */
pxor mm7, mm7 ; /* mm7 contains the result */
mov edi, 8 ; /* 8 rows */
loop_start: ;
movq mm0, [ebx] ; /* take 8 bytes */
movq mm2, [ecx] ;
movq mm3, [edx] ; /* take average of mm2 and mm3 */
movq mm1, mm2 ;
pand mm1, mm3 ;
pxor mm3, mm2 ;
pand mm3, mm5 ;
psrlq mm3, 1 ;
paddb mm1, mm3 ;
movq mm2, mm0 ;
psubusb mm0, mm1 ; /* A - B */
psubusb mm1, mm2 ; /* B - A */
por mm0, mm1 ; /* and or gives abs difference */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -