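// File: idctchen.cpp
// (The original first two lines were a file-name banner and a "font size" label left over from a web code viewer; they are not part of the source.)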
#include "StdAfx.h"
#pragma warning(once:4305 4244)
/*
IDCT routine in MMX (Chen algorithm).
Syntax: idct_chen_mmx1(short int *data)
(in this file the routine is defined as idct_chen(short int *x)),
where data is the input block of 64 short ints (16-bit integers).
Accuracy:
Although this routine is not IEEE-1180 compliant, the accuracy
should be enough for normal usage in image and video compression.
For IEEE-1180 compliance, there are 6 tests and 3 ranges of input data.
The following table shows the results of the tests for the largest
range of input data (-300 --> 300).
Test   Requirement   idct_chen_mmx1   idct_chen_mmx2
------------------------------------------------------
AZ     0             0                0
PPE    <=1           1                1
PMSE   <=.06         .05              .02
PME    <=.015        .03              .0146
OMSE   <=.02         .035             .018
OME    <=.0015       .010             .003
Thus idct_chen_mmx1 passes 3 of the 6 tests and idct_chen_mmx2 passes 5
(the latter routine is a little slower and is still being simulated).
Both routines operate on data in 16-bit integer format.
If the input data were 32-bit integers instead of 16-bit short ints,
the routine could be modified to become fully IEEE-1180 compliant (a trivial
and boring task), but the penalty is that it would also be twice as slow.
Comments and/or recommendations should be directed to
knguyen@ece.ubc.ca
http://www.ece.ubc.ca/~knguyen/software/idct_chen_mmx1.c
MPEG2AVI
--------
v0.16B33 code has been hand-scheduled (instruction reordering.)
This will yield better performance on Pentium/MMX CPUs, but won't
help dynamic-execution CPUs (K6/2 and later, PentiumII/Celeron)
v0.16B22 Chen MMX16 IDCT imported into MPEG2AVI
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// ---------------------------------------------------------------------------
// Fixed-point parameters and constant tables for the MMX IDCT (idct_chen).
// ---------------------------------------------------------------------------
// LS1: left-shift count applied with psllw to each group of input
// coefficients right after it is loaded.
#define idct_chen_mmx1_LS1 4
// RS8 is defined equal to LS1, and LS8 is one less than RS8 (i.e. 3).
// NOTE(review): neither RS8 nor LS8 is referenced in the part of idct_chen
// visible here -- presumably used by a later scaling stage; confirm.
#define idct_chen_mmx1_RS8 idct_chen_mmx1_LS1
#define idct_chen_mmx1_LS8 (idct_chen_mmx1_RS8-1)
// Multiplier constants. Each 64-bit value is one 16-bit word replicated four
// times, so a single pmulhw applies the same factor to all four lanes of an
// MMX register. The words are approximately trigonometric values scaled by
// 2^15 (pmulhw keeps the high 16 bits of each 16x16 product):
const static __int64 idct_chen_mmx1_c0 =0x5A825A825A825A82; // 0x5A82 = 23170 ~ cos(pi/4)   * 32768
const static __int64 idct_chen_mmx1_c1 =0x7D8A7D8A7D8A7D8A; // 0x7D8A = 32138 ~ cos(pi/16)  * 32768
const static __int64 idct_chen_mmx1_c2 =0x6A6E6A6E6A6E6A6E; // 0x6A6E = 27246 ~ cos(3pi/16) * 32768
const static __int64 idct_chen_mmx1_c3 =0x471D471D471D471D; // 0x471D = 18205 ~ sin(3pi/16) * 32768
const static __int64 idct_chen_mmx1_c4 =0x18F918F918F918F9; // 0x18F9 =  6393 ~ sin(pi/16)  * 32768
const static __int64 idct_chen_mmx1_c5 =0x7642764276427642; // 0x7642 = 30274 ~ cos(pi/8)   * 32768
const static __int64 idct_chen_mmx1_c6 =0x30FC30FC30FC30FC; // 0x30FC = 12540 ~ sin(pi/8)   * 32768
// Correction constants. idct_chen_mmx1_cNc is added (paddsw) to the shifted
// operand immediately before that operand is multiplied (pmulhw) by
// idct_chen_mmx1_cN. Each is a small integer replicated across the four lanes.
// NOTE(review): the specific values (1, 1, 1, 2, 6, 1, 3) look like tuned
// rounding biases for the truncating multiply -- confirm against the
// original author's derivation before changing them.
const static __int64 idct_chen_mmx1_c0c =0x0001000100010001; // bias paired with c0
const static __int64 idct_chen_mmx1_c1c =0x0001000100010001; // bias paired with c1
const static __int64 idct_chen_mmx1_c2c =0x0001000100010001; // bias paired with c2
const static __int64 idct_chen_mmx1_c3c =0x0002000200020002; // bias paired with c3
const static __int64 idct_chen_mmx1_c4c =0x0006000600060006; // bias paired with c4
const static __int64 idct_chen_mmx1_c5c =0x0001000100010001; // bias paired with c5
const static __int64 idct_chen_mmx1_c6c =0x0003000300030003; // bias paired with c6
void idct_chen(short int *x) // x should be (32)align
{
__asm{
mov ebx, [x]
//#define mm0 mm0
//#define mm1 mm1
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
//#define mm5 mm5
//#define mm6 mm6
//#define mm7 mm7
movq mm0, [ebx+16*2]; // mm0=b2 mm0
movq mm1, [ebx+48*2]; // mm1=b3 mm1
psllw mm0, idct_chen_mmx1_LS1;
movq mm2, idct_chen_mmx1_c6c; // mm2
psllw mm1, idct_chen_mmx1_LS1;
movq mm3, idct_chen_mmx1_c5c; // mm3
movq mm4, mm0; // mm4=mm0=b2 mm4
paddsw mm0, mm2; // b2+c6c
movq mm5, mm1; // mm5=mm1=b3 mm5
movq mm6, idct_chen_mmx1_c6; // mm6
paddsw mm1, mm2; // b3+c6c mm2
pmulhw mm1, mm6; //
paddsw mm4, mm3; // b2+c5c
movq mm7, idct_chen_mmx1_c5; // mm7
pmulhw mm4, mm7; //
paddsw mm5, mm3; // b3+c5c mm3
pmulhw mm0, mm6; // mm6
movq mm3, [ebx+32*2]; // mm3=b1 mm3
pmulhw mm5, mm7; // mm7
movq mm2, [ebx+0*2]; // mm2=b0 mm2
paddsw mm1, mm4; // mm1=a3 mm4
movq mm4, idct_chen_mmx1_c0c; // mm4
psllw mm3, idct_chen_mmx1_LS1;
psllw mm2, idct_chen_mmx1_LS1;
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
psubsw mm0,mm5; // mm0=a2 mm5
movq mm6, idct_chen_mmx1_c0; // mm6
paddsw mm2, mm4; // b0+c0c
//#define mm6 mm6
pmulhw mm2, mm6;
paddsw mm3, mm4; // b1+c0c mm4
pmulhw mm3, mm6; // mm6
;//slot
//#define mm7 mm7
movq mm6, [ebx+8*2]; // mm6=a0 mm6
;//slot
psllw mm6, idct_chen_mmx1_LS1;
movq mm7, mm2; // mm7
paddsw mm2, mm3; // mm2=a0
psubsw mm7, mm3; // mm7=a1 mm3
// a(0,1,2,3) in ra(24,28,16,17)
//#define mm5 mm5
movq mm5, mm2; // mm5=mm2=a0 mm5
paddsw mm2, mm1; // mm2=b0
psubsw mm5, mm1; // mm5=b3 mm1
movq mm4, mm7; // mm4=mm7=a1 mm4
movq [ebx+0*2], mm2; // mm2
paddsw mm7, mm0; // mm7=b1
movq [ebx+48*2], mm5; // mm5
//#define mm4 mm4
// movq mm4, mm7; // mm4=mm7=a1 mm4
// paddsw mm7, mm0; // mm7=b1
psubsw mm4, mm0; // mm4=b2 mm0
movq [ebx+16*2], mm7; // mm7
movq mm5, mm6; // mm5=mm6=a0 mm5
// movq [ebx+32*2], mm4; // mm4
// b(0,1,2,3) are stored back
//#define mm6 mm6
// movq mm6, [ebx+8*2]; // mm6=a0 mm6
// psllw mm6, idct_chen_mmx1_LS1;
//#define mm3 mm3
movq mm3, [ebx+56*2]; // mm3=a3 mm3
;//slot
movq [ebx+32*2], mm4; // mm4
psllw mm3, idct_chen_mmx1_LS1;
//#define mm2 mm2
//#define mm1 mm1
//#define mm5 mm5
//#define mm7 mm7
movq mm2, idct_chen_mmx1_c4c; // mm2
movq mm7, mm3; // mm7=mm3=a3 mm7
movq mm1, idct_chen_mmx1_c1c; // mm1
// movq mm5, mm6; // mm5=mm6=a0 mm5
paddsw mm6, mm2; // a0+c4c
movq mm0, idct_chen_mmx1_c4; // mm0
paddsw mm3, mm2; // a3+c4c mm2
pmulhw mm3, mm0; //
paddsw mm5, mm1; // a0+c1c
// paddsw mm7, mm1; // a3+c1c mm1
//#define mm0 mm0
//#define mm4 mm4
// movq mm0, idct_chen_mmx1_c4; // mm0
movq mm4, idct_chen_mmx1_c1; // mm4
pmulhw mm5, mm4; //
paddsw mm7, mm1; // a3+c1c mm1
pmulhw mm6, mm0; // mm0
// pmulhw mm3, mm0; //
movq mm1, [ebx+40*2]; // mm1=a2 mm1
pmulhw mm7, mm4; // mm4
movq mm2, [ebx+24*2]; // mm2=a1 mm2
psllw mm1, idct_chen_mmx1_LS1;
paddsw mm3,mm5; // mm3=c3 mm5
// psubsw mm6,mm7; // mm6=c0 mm7
//#define mm2 mm2
//#define mm1 mm1
// movq mm2, [ebx+24*2]; // mm2=a1 mm2
psllw mm2, idct_chen_mmx1_LS1;
movq mm5, idct_chen_mmx1_c2c; // mm5
psubsw mm6,mm7; // mm6=c0 mm7
// movq mm1, [ebx+40*2]; // mm1=a2 mm1
// psllw mm1, idct_chen_mmx1_LS1;
//#define mm5 mm5
//#define mm0 mm0
// movq mm5, idct_chen_mmx1_c2c; // mm5
movq mm0, idct_chen_mmx1_c3c; // mm0
//#define mm4 mm4
//#define mm7 mm7
movq mm4, mm2; // mm4=mm2=a1 mm4
movq mm7, mm1; // mm7=mm1=a2 mm7
paddsw mm2, mm5; // a1+c2c
paddsw mm1, mm5; // a2+c2c mm5
paddsw mm4, mm0; // a1+c3c
movq mm5, idct_chen_mmx1_c2; // mm5
paddsw mm7, mm0; // a2+c3c mm0
//#define mm5 mm5
//#define mm0 mm0
// movq mm5, idct_chen_mmx1_c2; // mm5
movq mm0, idct_chen_mmx1_c3; // mm0
pmulhw mm1, mm5; //
pmulhw mm4, mm0; //
;//slot
pmulhw mm7, mm0; // mm0
;//slot
movq mm0, idct_chen_mmx1_c0c; // mm0
pmulhw mm2, mm5; // mm5
movq mm5, mm3; // mm5=mm3=c3 mm5
psubsw mm1, mm4; // mm1=c1 mm4
// paddsw mm2, mm7; // mm2=c2 mm7
// c(0,1,2,3) in ra(0,9,8,1)
//#define mm4 mm4
//#define mm5 mm5
//#define mm0 mm0
movq mm4, mm6; // mm4=mm6=c0 mm4
paddsw mm6, mm1; // mm6=a0
paddsw mm2, mm7; // mm2=c2 mm7
psubsw mm4, mm1; // mm4=a1 mm1
// movq mm5, mm3; // mm5=mm3=c3 mm5
movq mm7, idct_chen_mmx1_c0; // mm7
paddsw mm3, mm2; // mm3=a3
psubsw mm5, mm2; // mm5=a2 mm2
// a(0,1,2,3) in ra(0,31,32,1)
// movq mm0, idct_chen_mmx1_c0c; // mm0
psllw mm4, 1
psllw mm5, 1
paddsw mm4, mm0; // a1+c0c
paddsw mm5, mm0; // a2+c0c mm0
//#define mm7 mm7
//#define mm1 mm1
//#define mm2 mm2
//#define mm0 mm0
// movq mm7, idct_chen_mmx1_c0; // mm7
pmulhw mm4, mm7; // mm7
movq mm2, [ebx+0*2]; // mm2=b0 mm2
pmulhw mm5, mm7;
movq mm0, mm2; // mm0
paddsw mm2, mm3; // mm2=y0
movq mm7, [ebx+16*2]; // mm7=b1 mm7
psubsw mm0, mm3; // mm0=y7 mm3
movq mm1, mm5; // mm1
psubsw mm5, mm4; // mm5=c1
paddsw mm1, mm4; // mm1=c2 mm4
movq mm4, mm7; // mm4
movq [ebx+0*2], mm2; // mm2
paddsw mm7, mm1; // mm7=y1 // c(0,1,2,3) in ra(0,32,35,1)
movq [ebx+56*2], mm0; // mm0
//#define mm7 mm7
//#define mm4 mm4
//#define mm2 mm2
// movq mm7, [ebx+16*2]; // mm7=b1 mm7
// movq mm4, mm7; // mm4
// paddsw mm7, mm1; // mm7=y1
psubsw mm4, mm1; // mm4=y6 mm1
movq [ebx+8*2], mm7; // mm7
;//slot
movq mm2, [ebx+48*2]; // mm2=b3 mm2
movq [ebx+48*2], mm4; // mm4
//#define mm3 mm3
//#define mm0 mm0
//#define mm7 mm7
movq mm3, mm2; // mm3
movq mm0, [ebx+32*2]; // mm0=b2 mm0
paddsw mm2, mm6; // mm2=y3
psubsw mm3, mm6; // mm3=y4 mm6
movq mm7, mm0; // mm7
movq [ebx+24*2], mm2; // mm2
paddsw mm0, mm5; // mm0=y2
movq [ebx+32*2], mm3; // mm3
psubsw mm7, mm5; // mm7=y5 mm5
movq [ebx+16*2], mm0; // mm0
;//slot
movq mm0, [ebx+16*2+8]; // mm0=b2 mm0
;//slot
movq [ebx+40*2], mm7; // mm7
//#define mm0 mm0
//#define mm1 mm1
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
//#define mm5 mm5
//#define mm6 mm6
//#define mm7 mm7
// movq mm0, [ebx+16*2+8]; // mm0=b2 mm0
psllw mm0, idct_chen_mmx1_LS1;
movq mm1, [ebx+48*2+8]; // mm1=b3 mm1
movq mm4, mm0; // mm4=mm0=b2 mm4
movq mm2, idct_chen_mmx1_c6c; // mm2
psllw mm1, idct_chen_mmx1_LS1;
movq mm3, idct_chen_mmx1_c5c; // mm3
movq mm5, mm1; // mm5=mm1=b3 mm5
paddsw mm0, mm2; // b2+c6c
paddsw mm1, mm2; // b3+c6c mm2
movq mm6, idct_chen_mmx1_c6; // mm6
paddsw mm4, mm3; // b2+c5c
movq mm7, idct_chen_mmx1_c5; // mm7
pmulhw mm1, mm6; //
paddsw mm5, mm3; // b3+c5c mm3
pmulhw mm4, mm7; //
movq mm2, [ebx+0*2+8]; // mm2=b0 mm2
pmulhw mm0, mm6; // mm6
pmulhw mm5, mm7; // mm7
psllw mm2, idct_chen_mmx1_LS1;
// paddsw mm1, mm4; // mm1=a3 mm4
// psubsw mm0,mm5; // mm0=a2 mm5
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
// movq mm2, [ebx+0*2+8]; // mm2=b0 mm2
// psllw mm2, idct_chen_mmx1_LS1;
movq mm3, [ebx+32*2+8]; // mm3=b1 mm3
paddsw mm1, mm4; // mm1=a3 mm4
movq mm4, idct_chen_mmx1_c0c; // mm4
psllw mm3, idct_chen_mmx1_LS1;
movq mm6, idct_chen_mmx1_c0; // mm6
paddsw mm2, mm4; // b0+c0c
// psubsw mm0,mm5; // mm0=a2 mm5
// paddsw mm3, mm4; // b1+c0c mm4
//#define mm6 mm6
// movq mm6, idct_chen_mmx1_c0; // mm6
pmulhw mm2, mm6;
paddsw mm3, mm4; // b1+c0c mm4
pmulhw mm3, mm6; // mm6
psubsw mm0,mm5; // mm0=a2 mm5
//#define mm7 mm7
movq mm6, [ebx+8*2+8]; // mm6=a0 mm6
;//stall
movq mm7, mm2; // mm7
paddsw mm2, mm3; // mm2=a0
psubsw mm7, mm3; // mm7=a1 mm3
// a(0,1,2,3) in rb(24,28,16,17)
//#define mm5 mm5
movq mm5, mm2; // mm5=mm2=a0 mm5
paddsw mm2, mm1; // mm2=b0
psubsw mm5, mm1; // mm5=b3 mm1
movq mm4, mm7; // mm4=mm7=a1 mm4
paddsw mm7, mm0; // mm7=b1
movq [ebx+0*2+8], mm2; // mm2
psllw mm6, idct_chen_mmx1_LS1;
movq [ebx+48*2+8], mm5; // mm5
//#define mm4 mm4
// movq mm4, mm7; // mm4=mm7=a1 mm4
// paddsw mm7, mm0; // mm7=b1
psubsw mm4, mm0; // mm4=b2 mm0
movq [ebx+16*2+8], mm7; // mm7
;//stall
// movq [ebx+32*2+8], mm4; // mm4
// b(0,1,2,3) are stored back
//#define mm6 mm6
// movq mm6, [ebx+8*2+8]; // mm6=a0 mm6
// psllw mm6, idct_chen_mmx1_LS1;
//#define mm3 mm3
movq mm3, [ebx+56*2+8]; // mm3=a3 mm3
;//stall
movq [ebx+32*2+8], mm4; // mm4
psllw mm3, idct_chen_mmx1_LS1;
//#define mm2 mm2
//#define mm1 mm1
//#define mm5 mm5
//#define mm7 mm7
movq mm2, idct_chen_mmx1_c4c; // mm2
movq mm5, mm6; // mm5=mm6=a0 mm5
movq mm1, idct_chen_mmx1_c1c; // mm1
movq mm7, mm3; // mm7=mm3=a3 mm7
movq mm0, idct_chen_mmx1_c4; // mm0
paddsw mm3, mm2; // a3+c4c mm2
movq mm4, idct_chen_mmx1_c1; // mm4
pmulhw mm3, mm0; //
paddsw mm6, mm2; // a0+c4c
paddsw mm5, mm1; // a0+c1c
paddsw mm7, mm1; // a3+c1c mm1
//#define mm0 mm0
//#define mm4 mm4
// movq mm0, idct_chen_mmx1_c4; // mm0
// movq mm4, idct_chen_mmx1_c1; // mm4
// pmulhw mm3, mm0; //
pmulhw mm5, mm4; //
paddsw mm3,mm5; // mm3=c3 mm5
pmulhw mm6, mm0; // mm0
movq mm2, [ebx+24*2+8]; // mm2=a1 mm2
pmulhw mm7, mm4; // mm4
// psubsw mm6,mm7; // mm6=c0 mm7
//#define mm2 mm2
//#define mm1 mm1
// movq mm2, [ebx+24*2+8]; // mm2=a1 mm2
movq mm1, [ebx+40*2+8]; // mm1=a2 mm1
psllw mm2, idct_chen_mmx1_LS1;
movq mm5, idct_chen_mmx1_c2c; // mm5
psllw mm1, idct_chen_mmx1_LS1;
//#define mm5 mm5
//#define mm0 mm0
// movq mm5, idct_chen_mmx1_c2c; // mm5
movq mm0, idct_chen_mmx1_c3c; // mm0
psubsw mm6,mm7; // mm6=c0 mm7
//#define mm4 mm4
//#define mm7 mm7
movq mm4, mm2; // mm4=mm2=a1 mm4
movq mm7, mm1; // mm7=mm1=a2 mm7
paddsw mm2, mm5; // a1+c2c
paddsw mm1, mm5; // a2+c2c mm5
movq mm5, idct_chen_mmx1_c2; // mm5
paddsw mm4, mm0; // a1+c3c
paddsw mm7, mm0; // a2+c3c mm0
//#define mm5 mm5
//#define mm0 mm0
// movq mm5, idct_chen_mmx1_c2; // mm5
pmulhw mm1, mm5; //
movq mm0, idct_chen_mmx1_c3; // mm0
pmulhw mm2, mm5; // mm5
pmulhw mm4, mm0; //
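// NOTE: the text that stood here was page chrome from a web code viewer, not
// part of the source. It was a keyboard-shortcut help panel:
//   copy code: Ctrl + C; search code: Ctrl + F; full-screen mode: F11;
//   toggle theme: Ctrl + Shift + D; show shortcuts: ?;
//   increase font size: Ctrl + =; decrease font size: Ctrl + -
// NOTE(review): the listing of idct_chen appears to stop before the __asm
// block and the function are closed -- recover the remainder from the
// original source.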