mmxmotio.c
来自「symbian 下的helix player源代码」· C语言 代码 · 共 1,081 行 · 第 1/2 页
C
1,081 行
/*
while (vSize > 0) {
out[0] = (in[0] + in[hdim+0] + 1) >> 1;
out[1] = (in[1] + in[hdim+1] + 1) >> 1;
out[2] = (in[2] + in[hdim+2] + 1) >> 1;
out[3] = (in[3] + in[hdim+3] + 1) >> 1;
in += hdim;
out += hdim;
--vSize;
}
return;
*/
}
static void mc16pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
xor ebx, ebx
mov edx, hdim ;//one row ahead
mov ecx, vSize
mov esi, inpix
mov edi, outpix
pxor mm0, mm0 ;// mm0 = 0
movq mm7, const_0x0002000200020002
mc16pels2DInterpolMMX_loop:
movq mm1, [esi + ebx] ;//00
movq mm6, [esi + ebx +8] ;//05
movq mm5, mm1 ;//01
movq mm2, mm1 ;//02
psrlq mm5, 8 ;//shift one byte ;//06
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//07
punpcklbw mm1, mm0 ;//expand lower 4 pix ;//03
pxor mm5, mm6 ; ;//08
punpckhbw mm2, mm0 ;//expand higher 4 pix ;//04
movq mm6, mm5 ;//09
punpcklbw mm5, mm0 ;//expand lower 4 pix ;//10
paddw mm1, mm7 ;//add 2 ;//12
punpckhbw mm6, mm0 ;//expand higher 4 pix ;//11
paddw mm2, mm7 ;//add 2 ;//12
paddw mm1, mm5 ;//13
movq mm3, [esi + edx] ;//15
paddw mm2, mm6 ;//14
movq mm5, mm3 ;//16
movq mm4, mm3 ;//18
punpcklbw mm3, mm0 ;//expand lower 4 pix ;//18
movq mm6, [esi + edx + 8] ;//22
psrlq mm5, 8 ;//shift one byte ;//23
punpckhbw mm4, mm0 ;//expand higher 4 pix ;//19
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//24
paddw mm1, mm3 ;//20
pxor mm5, mm6 ; ;//25
paddw mm2, mm4 ;//21
movq mm6, mm5 ;//26
punpcklbw mm5, mm0 ;//expand lower 4 pix ;//27
punpckhbw mm6, mm0 ;//expand higher 4 pix ;//28
paddw mm1, mm5 ;//29
paddw mm2, mm6 ;//30
psrlw mm1, 2 ;//31
psrlw mm2, 2 ;//32
movq mm6, [esi + ebx + 16] ;//40
packuswb mm1, mm2 ;//33
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//42
movq [edi + ebx], mm1 ;//34
//-----------------//--------------//----------------
movq mm1, [esi + ebx + 8] ;//35
movq mm5, mm1 ;//36
movq mm2, mm1 ;//37
psrlq mm5, 8 ;//shift one byte ;//41
punpcklbw mm1, mm0 ;//expand lower 4 pix ;//38
pxor mm5, mm6 ; ;//43
punpckhbw mm2, mm0 ;//expand higher 4 pix ;//39
movq mm6, mm5 ;//44
punpcklbw mm5, mm0 ;//expand lower 4 pix ;//45
punpckhbw mm6, mm0 ;//expand higher 4 pix ;//46
paddw mm1, mm7 ;//add 2 ;//47
paddw mm2, mm7 ;//add 2 ;//48
paddw mm1, mm5 ;//49
paddw mm2, mm6 ;//50
movq mm3, [esi + edx + 8] ;//51
movq mm6, [esi + edx + 16] ;//58
movq mm5, mm3 ;//52
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//60
movq mm4, mm3 ;//53
psrlq mm5, 8 ;//shift one byte ;//59
punpcklbw mm3, mm0 ;//expand lower 4 pix ;//54
pxor mm5, mm6 ; ;//61
punpckhbw mm4, mm0 ;//expand higher 4 pix ;//55
paddw mm1, mm3 ;//56
paddw mm2, mm4 ;//57
movq mm6, mm5 ;//62
punpcklbw mm5, mm0 ;//expand lower 4 pix ;//63
punpckhbw mm6, mm0 ;//expand higher 4 pix ;//64
paddw mm1, mm5 ;//65
paddw mm2, mm6 ;//66
psrlw mm1, 2 ;//67
psrlw mm2, 2 ;//68
add edx, hdim
packuswb mm1, mm2 ;//69
movq [edi + ebx + 8], mm1 ;//70
add ebx, hdim
dec ecx
jg mc16pels2DInterpolMMX_loop
emms
}
/*
while (vSize > 0) {
out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
out[8] = (in[8] + in[9] + in[hdim+8] + in[hdim+9] + 2) >> 2;
out[9] = (in[9] + in[10] + in[hdim+9] + in[hdim+10] + 2) >> 2;
out[10] = (in[10] + in[11] + in[hdim+10] + in[hdim+11] + 2) >> 2;
out[11] = (in[11] + in[12] + in[hdim+11] + in[hdim+12] + 2) >> 2;
out[12] = (in[12] + in[13] + in[hdim+12] + in[hdim+13] + 2) >> 2;
out[13] = (in[13] + in[14] + in[hdim+13] + in[hdim+14] + 2) >> 2;
out[14] = (in[14] + in[15] + in[hdim+14] + in[hdim+15] + 2) >> 2;
out[15] = (in[15] + in[16] + in[hdim+15] + in[hdim+16] + 2) >> 2;
in += hdim;
out += hdim;
--vSize;
}
return;
*/
}
static void mc8pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
xor ebx, ebx
mov edx, hdim ;//one row ahead
mov ecx, vSize
mov esi, inpix
mov edi, outpix
pxor mm0, mm0 ;// mm0 = 0
movq mm7, const_0x0002000200020002
mc8pels2DInterpolMMX_loop:
movq mm1, [esi + ebx]
movq mm6, [esi + ebx +8]
movq mm5, mm1
movq mm2, mm1
psrlq mm5, 8 ;//shift one byte
punpcklbw mm1, mm0 ;//expand lower 4 pix
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4
punpckhbw mm2, mm0 ;//expand higher 4 pix
pxor mm5, mm6 ;
paddw mm1, mm7 ;//add 2
movq mm6, mm5
punpcklbw mm5, mm0 ;//expand lower 4 pix
paddw mm2, mm7 ;//add 2
punpckhbw mm6, mm0 ;//expand higher 4 pix
paddw mm1, mm5
paddw mm2, mm6
movq mm3, [esi + edx]
movq mm6, [esi + edx + 8]
movq mm5, mm3
movq mm4, mm3
punpcklbw mm3, mm0 ;//expand lower 4 pix
psrlq mm5, 8 ;//shift one byte
punpckhbw mm4, mm0 ;//expand higher 4 pix
psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4
paddw mm1, mm3
pxor mm5, mm6 ;
paddw mm2, mm4
movq mm6, mm5
punpcklbw mm5, mm0 ;//expand lower 4 pix
punpckhbw mm6, mm0 ;//expand higher 4 pix
paddw mm1, mm5
paddw mm2, mm6
psrlw mm1, 2
psrlw mm2, 2
add edx, hdim
packuswb mm1, mm2
movq [edi + ebx], mm1
add ebx, hdim
dec ecx
jg mc8pels2DInterpolMMX_loop
emms
}
/*
while (vSize > 0) {
out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
in += hdim;
out += hdim;
--vSize;
}
return;
*/
}
static void mc4pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
xor ebx, ebx
mov edx, hdim ;//one row ahead
mov ecx, vSize
mov esi, inpix
mov edi, outpix
pxor mm0, mm0 ;// mm0 = 0
movq mm7, const_0x0002000200020002
mc4pels2DInterpolMMX_loop:
movq mm1, [esi + ebx]
movq mm5, mm1
punpcklbw mm1, mm0 ;//expand lower 4 pix
psrlq mm5, 8 ;//shift one byte
punpcklbw mm5, mm0 ;//expand lower 4 pix
paddw mm1, mm7 ;//add 2
paddw mm1, mm5
movq mm3, [esi + edx]
movq mm5, mm3
punpcklbw mm3, mm0 ;//expand lower 4 pix
paddw mm1, mm3
psrlq mm5, 8 ;//shift one byte
punpcklbw mm5, mm0 ;//expand lower 4 pix
paddw mm1, mm5
psrlw mm1, 2
packuswb mm1, mm0
movd [edi + ebx], mm1
add ebx, hdim
add edx, hdim
dec ecx
jg mc4pels2DInterpolMMX_loop
emms
}
/*
while (vSize > 0) {
out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
in += hdim;
out += hdim;
--vSize;
}
return;
*/
}
// mc - Perform motion compensation for a hSize x vSize block
void mcMMX( int hSize, int vSize,
PIXEL *in, PIXEL *out, int hdim,
int mvX, int mvY // Motion vector
)
{
int intX, intY, fracX, fracY;
intX = mvX >> 1; // Integer part of motion vector
intY = mvY >> 1;
fracX = mvX & 0x1; // Fractional part of motion vector
fracY = mvY & 0x1;
in += intX + intY * hdim;
if (hSize != 16 && hSize != 8 && hSize != 4) {
H261ErrMsg("mc -- hSize not supported");
exit(0);
}
//#define MMX_MC_TEST
#ifdef MMX_MC_TEST
{
PIXEL in_org[17*352];
PIXEL out_org[17*352];
void mc( int hSize, int vSize, PIXEL in[], PIXEL out[], int hdim,int mvX, int mvY);
int bError = 0;
int row, col;
for(row=0; row<vSize+1; row++) {
for(col=0; col<hSize+1; col++) {
in_org[row*hdim + col] = in[row*hdim + col];
//out_org[row*hdim + col] = out[row*hdim + col];
}
}
#endif
if (fracY == 0) {
if (fracX == 0) {
// No interpolation
if (hSize == 8) {
mc8pelsNoInterpolMMX( in, out, hdim, vSize );
} else if (hSize == 16) {
mc16pelsNoInterpolMMX( in, out, hdim, vSize );
} else {
mc4pelsNoInterpolMMX( in, out, hdim, vSize );
}
} else {
// Horizontal interpolation
if (hSize == 8) {
mc8pelsHorInterpolMMX( in, out, hdim, vSize );
} else if (hSize == 16) {
mc16pelsHorInterpolMMX( in, out, hdim, vSize );
} else {
mc4pelsHorInterpolMMX( in, out, hdim, vSize );
}
}
} else if (fracX == 0) {
// Vertical interpolation
if (hSize == 8) {
mc8pelsVertInterpolMMX( in, out, hdim, vSize );
} else if (hSize == 16) {
mc16pelsVertInterpolMMX( in, out, hdim, vSize );
} else {
mc4pelsVertInterpolMMX( in, out, hdim, vSize );
}
} else { // Bilinear interpolation
if (hSize == 8) {
mc8pels2DInterpolMMX( in, out, hdim, vSize );
} else if (hSize == 16) {
mc16pels2DInterpolMMX( in, out, hdim, vSize );
} else {
mc4pels2DInterpolMMX( in, out, hdim, vSize );
}
}
#ifdef MMX_MC_TEST
mc(hSize,vSize, in_org - intX - intY * hdim, out_org, hdim, mvX, mvY );
for(row=0; row<vSize; row++) {
for(col=0; col<hSize; col++) {
if(in_org[row*hdim + col] != in[row*hdim + col]) {
bError = 1;
}
if(out_org[row*hdim + col] != out[row*hdim + col]) {
bError = 1;
}
}
}
}
#endif
return;
}
// averageBlock - compute average of two hSize*vSize pixel arrays
void averageBlockMMX( PIXEL forPred[], int hSize, int vSize, int forOffset,
PIXEL backPred[], int backOffset )
{
int row, col;
int nbPerRow = (hSize>>2) ;
//do row size multiples of 4
if(nbPerRow==2) {
__asm {//8
mov esi, forPred
mov edi, backPred
mov ecx, vSize
pxor mm0, mm0; // mm0 = 0
aver_8:
movq mm1, [esi]
movq mm3, [edi]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0 ;// expand lower 4 pix
punpckhbw mm2, mm0 ;// expand higher 4 pix
punpcklbw mm3, mm0 ;// expand lower 4 pix
punpckhbw mm4, mm0 ;// expand higher 4 pix
paddw mm1, mm3
paddw mm2, mm4
psrlw mm1, 1
psrlw mm2, 1
packuswb mm1, mm2
add edi, backOffset
movq [esi], mm1
add esi, forOffset
dec ecx
jg aver_8
emms
}
} else if(nbPerRow>=4) {
nbPerRow = 4;
__asm {//16
mov esi, forPred
mov edi, backPred
mov ecx, vSize
pxor mm0, mm0; // mm0 = 0
aver_16:
movq mm1, [esi] ;//00
movq mm5, [esi+8] ;//10
movq mm2, mm1 ;//00
punpcklbw mm1, mm0 ;// expand lower 4 pix ;//00
movq mm6, mm5 ;//10
punpckhbw mm2, mm0 ;// expand higher 4 pix ;//00
movq mm3, [edi] ;//00
punpcklbw mm5, mm0 ;// expand lower 4 pix ;//10
movq mm4, mm3 ;//00
punpckhbw mm6, mm0 ;// expand higher 4 pix ;//10
movq mm7, [edi+8] ;//10
punpcklbw mm3, mm0 ;// expand lower 4 pix ;//00
punpckhbw mm4, mm0 ;// expand higher 4 pix ;//00
paddw mm2, mm4 ;//00
paddw mm1, mm3 ;//00
movq mm4, mm7 ;//10
punpcklbw mm7, mm0 ;// expand lower 4 pix ;//10
punpckhbw mm4, mm0 ;// expand higher 4 pix ;//10
paddw mm5, mm7 ;//10
psrlw mm1, 1 ;//00
paddw mm6, mm4 ;//10
psrlw mm2, 1 ;//00
psrlw mm5, 1 ;//10
packuswb mm1, mm2 ;//00
psrlw mm6, 1 ;//10
packuswb mm5, mm6 ;//10
movq [esi], mm1 ;//00
add edi, backOffset
movq [esi+8], mm5 ;//10
add esi, forOffset
dec ecx
jg aver_16
emms
}
} else if(nbPerRow==1) {
__asm {//4
mov esi, forPred
mov edi, backPred
mov ecx, vSize
pxor mm0, mm0; // mm0 = 0
aver_4:
movq mm1, [esi]
movq mm3, [edi]
punpcklbw mm1, mm0 ;// expand lower 4 pix
punpcklbw mm3, mm0 ;// expand lower 4 pix
paddw mm1, mm3
psrlw mm1, 1
packuswb mm1, mm0
add edi, backOffset
movd [esi], mm1
add esi, forOffset
dec ecx
jg aver_4
emms
}
} else if(nbPerRow==3) {
__asm {//12
mov esi, forPred
mov edi, backPred
mov ecx, vSize
pxor mm0, mm0; // mm0 = 0
aver_12:
movq mm1, [esi] ;//00
movq mm5, [esi+8] ;//10
movq mm2, mm1 ;//00
punpcklbw mm1, mm0 ;// expand lower 4 pix ;//00
punpckhbw mm2, mm0 ;// expand higher 4 pix ;//00
movq mm3, [edi] ;//00
punpcklbw mm5, mm0 ;// expand lower 4 pix ;//10
movq mm4, mm3 ;//00
punpcklbw mm3, mm0 ;// expand lower 4 pix ;//00
punpckhbw mm4, mm0 ;// expand higher 4 pix ;//00
movq mm7, [edi+8] ;//10
paddw mm2, mm4 ;//00
punpcklbw mm7, mm0 ;// expand lower 4 pix ;//10
paddw mm1, mm3 ;//00
psrlw mm2, 1 ;//00
paddw mm5, mm7 ;//10
psrlw mm1, 1 ;//00
psrlw mm5, 1 ;//10
packuswb mm1, mm2 ;//00
packuswb mm5, mm0 ;//10
movq [esi], mm1 ;//00
add edi, backOffset
movd [esi+8], mm5 ;//10
add esi, forOffset
dec ecx
jg aver_12
emms
}
}
for (row = 0; row < vSize; ++row) {
for (col = nbPerRow<<2; col < hSize; ++col) {//from smaller multiple of 4
forPred[col] = (forPred[col] + backPred[col]) >> 1;
}
forPred += forOffset;
backPred += backOffset;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?