/* File: dsp_mmx.c (header text from the web code viewer, which also showed a "font size:" control label here). */
movq mm1, mm0 ;
punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
paddw mm7, mm0 ; /* accumulate difference... */
punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
add ebx, SrcStride ; /* Inc pointer into the new data */
paddw mm7, mm1 ; /* accumulate difference... */
add ecx, RefStride ; /* Inc pointer into ref data */
add edx, RefStride ; /* Inc pointer into ref data */
dec edi ;
jnz loop_start ;
movq mm0, mm7 ;
psrlq mm7, 32 ;
paddw mm7, mm0 ;
movq mm0, mm7 ;
psrlq mm7, 16 ;
paddw mm7, mm0 ;
movd eax, mm7 ;
and eax, 0xffff ;
mov DiffVal, eax
};
return DiffVal;
#endif
}
/*
 * Intra-block error metric over 8 rows of 8 bytes.
 *
 * Starting at DataPtr and advancing by Stride bytes after each row, the
 * active (#else) branch accumulates
 *   XSum  = sum of the 64 byte values, and
 *   XXSum = sum of the 64 squared byte values,
 * and returns (XXSum << 6) - XSum * XSum, i.e. 64 * sum(x^2) - (sum x)^2,
 * which is 64^2 times the population variance of the 64 bytes.
 *
 * The #if 0 branch is a compiled-out scalar C version of the same
 * computation. The active branch uses MSVC-style __asm with MMX registers
 * mm0, mm2, mm5, mm6 and mm7 plus eax, ebx, ecx and edi.
 *
 * No emms is executed in this function, so the MMX state is left in use on
 * return.
 */
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
#if 0
ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; for (i=8; i; i--) { /* Accumulate every one of the 8 pixels in this row. */ XSum += DataPtr[0]; XXSum += DataPtr[0]*DataPtr[0]; XSum += DataPtr[1]; XXSum += DataPtr[1]*DataPtr[1]; XSum += DataPtr[2]; XXSum += DataPtr[2]*DataPtr[2]; XSum += DataPtr[3]; XXSum += DataPtr[3]*DataPtr[3]; XSum += DataPtr[4]; XXSum += DataPtr[4]*DataPtr[4]; XSum += DataPtr[5]; XXSum += DataPtr[5]*DataPtr[5]; XSum += DataPtr[6]; XXSum += DataPtr[6]*DataPtr[6]; XSum += DataPtr[7]; XXSum += DataPtr[7]*DataPtr[7]; /* Step to next row of block. */ DataPtr += Stride; } /* Compute population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum ) );
#else
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
__asm {
align 16
mov ecx, DataPtr /* ecx = pointer to the current row */
pxor mm5, mm5 ; /* mm5 = 0: four 16-bit partial sums of the bytes (XSum) */
pxor mm6, mm6 ; /* mm6 = 0: zero source used when widening bytes to words */
pxor mm7, mm7 ; /* mm7 = 0: two 32-bit partial sums of squares (XXSum) */
mov edi, 8 ; /* edi = row counter */
loop_start:
movq mm0, [ecx] ; /* take 8 bytes */
movq mm2, mm0 ; /* second copy, used for the high four bytes */
punpcklbw mm0, mm6 ; /* mm0 = bytes 0..3 zero-extended to words */
punpckhbw mm2, mm6 ; /* mm2 = bytes 4..7 zero-extended to words */
paddw mm5, mm0 ; /* add the eight values into the word sums */
paddw mm5, mm2 ;
pmaddwd mm0, mm0 ; /* square each word and add adjacent pairs -> two dwords */
pmaddwd mm2, mm2 ;
;
paddd mm7, mm0 ; /* add the squared terms into the dword sums */
paddd mm7, mm2 ;
add ecx, Stride ; /* Inc pointer into src data */
dec edi ;
jnz loop_start ; /* repeat until all 8 rows are done */
/* Fold the four word lanes of mm5 into its low word. */
movq mm0, mm5 ;
psrlq mm5, 32 ;
paddw mm5, mm0 ;
movq mm0, mm5 ;
psrlq mm5, 16 ;
paddw mm5, mm0 ;
movd edi, mm5 ;
movsx edi, di ; /* keep only the low 16-bit total (at most 64*255 = 16320), widened to 32 bits */
mov eax, edi ;
/* Fold the two dword lanes of mm7 into its low dword. */
movq mm0, mm7 ;
psrlq mm7, 32 ;
paddd mm7, mm0 ;
movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx;
};
/* Compute population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ) );
#endif
}
/*
 * Inter-block error metric between a source block and one reference block,
 * each 8 rows of 8 bytes.
 *
 * For every byte position the active (#else) branch forms the signed
 * difference d = SrcData[x] - RefDataPtr[x] (bytes are zero-extended to
 * 16 bits first), accumulates
 *   XSum  = sum of the 64 differences, and
 *   XXSum = sum of the 64 squared differences,
 * and returns (XXSum << 6) - XSum * XSum. SrcData advances by SrcStride and
 * RefDataPtr by RefStride bytes after each row.
 *
 * The #if 0 branch is a compiled-out scalar C version that uses the
 * DSP_OP_DIFF macro, which is defined outside this block (presumably a plain
 * subtraction; confirm against its definition).
 *
 * XSum is stored in an unsigned 32-bit variable after sign extension, so a
 * negative sum is held in two's-complement form; XSum * XSum then yields the
 * same low 32 bits as squaring the signed value.
 *
 * No emms is executed in this function, so the MMX state is left in use on
 * return.
 */
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
#if 0
ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; ogg_int32_t DiffVal; for (i=8; i; i--) { DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]); XSum += DiffVal; XXSum += DiffVal*DiffVal; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr += RefStride; } /* Compute and return population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum ));
#else
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
__asm {
align 16
mov ecx, SrcData /* ecx = pointer to the current source row */
mov edx, RefDataPtr /* edx = pointer to the current reference row */
pxor mm5, mm5 ; /* mm5 = 0: four 16-bit partial sums of the differences (XSum) */
pxor mm6, mm6 ; /* mm6 = 0: zero source used when widening bytes to words */
pxor mm7, mm7 ; /* mm7 = 0: two 32-bit partial sums of squared differences (XXSum) */
mov edi, 8 ; /* edi = row counter */
loop_start: ;
movq mm0, [ecx] ; /* take 8 bytes */
movq mm1, [edx] ; /* 8 reference bytes */
movq mm2, mm0 ; /* second copies, used for the high four bytes */
movq mm3, mm1 ;
punpcklbw mm0, mm6 ; /* mm0 = source bytes 0..3 as words */
punpcklbw mm1, mm6 ; /* mm1 = reference bytes 0..3 as words */
punpckhbw mm2, mm6 ; /* mm2 = source bytes 4..7 as words */
punpckhbw mm3, mm6 ; /* mm3 = reference bytes 4..7 as words */
psubsw mm0, mm1 ; /* signed differences; operands are 0..255 so results stay within -255..255 */
psubsw mm2, mm3 ;
paddw mm5, mm0 ; /* add the eight differences into the word sums */
paddw mm5, mm2 ;
pmaddwd mm0, mm0 ; /* square each difference and add adjacent pairs -> two dwords */
pmaddwd mm2, mm2 ;
;
paddd mm7, mm0 ; /* add the squared terms into the dword sums */
paddd mm7, mm2 ;
add ecx, SrcStride ; /* Inc pointer into src data */
add edx, RefStride ; /* Inc pointer into ref data */
dec edi ;
jnz loop_start ; /* repeat until all 8 rows are done */
/* Fold the four word lanes of mm5 into its low word. */
movq mm0, mm5 ;
psrlq mm5, 32 ;
paddw mm5, mm0 ;
movq mm0, mm5 ;
psrlq mm5, 16 ;
paddw mm5, mm0 ;
movd edi, mm5 ;
movsx edi, di ; /* the 16-bit total can be negative: sign-extend it to 32 bits */
mov eax, edi ;
/* Fold the two dword lanes of mm7 into its low dword. */
movq mm0, mm7 ;
psrlq mm7, 32 ;
paddd mm7, mm0 ;
movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx
};
/* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
#endif
}
/*
 * Inter-block error metric between a source block and the per-byte average
 * of two reference blocks, each 8 rows of 8 bytes.
 *
 * For every byte position the active (#else) branch computes
 *   avg = (r1 & r2) + (((r1 ^ r2) & 0xfe) >> 1)
 * which equals (r1 + r2) / 2 rounded down, then the signed difference
 * d = SrcData[x] - avg. It accumulates
 *   XSum  = sum of the 64 differences, and
 *   XXSum = sum of the 64 squared differences,
 * and returns (XXSum << 6) - XSum * XSum. SrcData advances by SrcStride and
 * both reference pointers by RefStride bytes after each row.
 *
 * The #if 0 branch is a compiled-out scalar C version that uses the
 * DSP_OP_DIFF and DSP_OP_AVG macros, which are defined outside this block
 * (presumably subtraction and a truncating average; confirm against their
 * definitions).
 *
 * XSum is stored in an unsigned 32-bit variable after sign extension, so a
 * negative sum is held in two's-complement form; XSum * XSum then yields the
 * same low 32 bits as squaring the signed value.
 *
 * No emms is executed in this function, so the MMX state is left in use on
 * return.
 */
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
#if 0
ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; ogg_int32_t DiffVal; for (i=8; i; i--) { DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); XSum += DiffVal; XXSum += DiffVal*DiffVal; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr1 += RefStride; RefDataPtr2 += RefStride; } /* Compute and return population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum ));
#else
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
__asm {
align 16
mov ebx, SrcData /* ebx = pointer to the current source row */
mov ecx, RefDataPtr1 /* ecx = pointer to the current row of reference 1 */
mov edx, RefDataPtr2 /* edx = pointer to the current row of reference 2 */
pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
paddb mm4, mm4 ; /* all-ones doubled per byte: 0xff + 0xff = 0xfe */
pxor mm5, mm5 ; /* mm5 = 0: four 16-bit partial sums of the differences (XSum) */
pxor mm6, mm6 ; /* mm6 = 0: zero source used when widening bytes to words */
pxor mm7, mm7 ; /* mm7 = 0: two 32-bit partial sums of squared differences (XXSum) */
mov edi, 8 ; /* edi = row counter */
loop_start: ;
movq mm0, [ebx] ; /* take 8 bytes */
movq mm2, [ecx] ;
movq mm3, [edx] ; /* take average of mm2 and mm3 */
movq mm1, mm2 ;
pand mm1, mm3 ; /* mm1 = bits common to both references */
pxor mm3, mm2 ; /* mm3 = bits that differ */
pand mm3, mm4 ; /* clear each byte's low bit so the 64-bit shift cannot leak into the neighbouring byte */
psrlq mm3, 1 ; /* halve the differing bits */
paddb mm1, mm3 ; /* mm1 = per-byte average, rounded down */
movq mm2, mm0 ; /* second copies, used for the high four bytes */
movq mm3, mm1 ;
punpcklbw mm0, mm6 ; /* mm0 = source bytes 0..3 as words */
punpcklbw mm1, mm6 ; /* mm1 = averaged reference bytes 0..3 as words */
punpckhbw mm2, mm6 ; /* mm2 = source bytes 4..7 as words */
punpckhbw mm3, mm6 ; /* mm3 = averaged reference bytes 4..7 as words */
psubsw mm0, mm1 ; /* signed differences; operands are 0..255 so results stay within -255..255 */
psubsw mm2, mm3 ;
paddw mm5, mm0 ; /* add the eight differences into the word sums */
paddw mm5, mm2 ;
pmaddwd mm0, mm0 ; /* square each difference and add adjacent pairs -> two dwords */
pmaddwd mm2, mm2 ;
;
paddd mm7, mm0 ; /* add the squared terms into the dword sums */
paddd mm7, mm2 ;
add ebx, SrcStride ; /* Inc pointer into src data */
add ecx, RefStride ; /* Inc pointer into ref data */
add edx, RefStride ; /* Inc pointer into ref data */
dec edi ;
jnz loop_start ; /* repeat until all 8 rows are done */
/* Fold the four word lanes of mm5 into its low word. */
movq mm0, mm5 ;
psrlq mm5, 32 ;
paddw mm5, mm0 ;
movq mm0, mm5 ;
psrlq mm5, 16 ;
paddw mm5, mm0 ;
movd edi, mm5 ;
movsx edi, di ; /* the 16-bit total can be negative: sign-extend it to 32 bits */
mov XSum, edi ; /* movl eax, edi ; Modified for vc to reuse eax */
/* Fold the two dword lanes of mm7 into its low dword. */
movq mm0, mm7 ;
psrlq mm7, 32 ;
paddd mm7, mm0 ;
movd XXSum, mm7 ; /* movd eax, mm7 ; Modified for vc to reuse eax */
};
/* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
#endif
}
/*
 * Execute the emms instruction (MSVC-style inline assembly).
 *
 * The *_err__mmx routines in this file use MMX registers and do not issue
 * emms themselves; this is the routine that dsp_mmx_init installs as
 * funcs->restore_fpu.
 * NOTE(review): presumably callers invoke it after a run of MMX routines and
 * before any x87 floating-point code - confirm against the call sites.
 */
static void restore_fpu (void)
{
__asm {
emms
}
}
/*
 * Point the entries of a DspFunctions table at the MMX routines defined in
 * this file.
 *
 * funcs: table to fill in; only the members assigned below are touched.
 *
 * A debug message is emitted through TH_DEBUG before any entry is written.
 */
void dsp_mmx_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");

  /* Variance-style error metrics. */
  funcs->intra8x8_err     = intra8x8_err__mmx;
  funcs->inter8x8_err     = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;

  /* Sum-of-absolute-difference routines. */
  funcs->row_sad8         = row_sad8__mmx;
  funcs->col_sad8x8       = col_sad8x8__mmx;
  funcs->sad8x8           = sad8x8__mmx;
  funcs->sad8x8_thres     = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;

  /* Block subtraction routines. */
  funcs->sub8x8           = sub8x8__mmx;
  funcs->sub8x8_128       = sub8x8_128__mmx;
  funcs->sub8x8avg2       = sub8x8avg2__mmx;

  /* Hook that issues emms once the MMX work is finished. */
  funcs->restore_fpu      = restore_fpu;
}
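/*
 * The text below is page content from the web code viewer, not part of
 * dsp_mmx.c:
 *
 *   Keyboard shortcuts
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */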