📄 dequantize_mmx.h
字号:
/* NOTE(review): everything up to the first closing brace below is the tail
 * of an asm statement (and function) that begins before this excerpt.  It is
 * reproduced unchanged, only re-flowed onto separate lines. */
                DEQUANTIZE_INTRA_LOCAL_STEP(4)
                DEQUANTIZE_PRESCALE_STEP(4)
                DEQUANTIZE_INTRA_LOCAL_STEP(5)
                DEQUANTIZE_PRESCALE_STEP(5)
                DEQUANTIZE_INTRA_LOCAL_STEP(6)
                DEQUANTIZE_PRESCALE_STEP(6)
                DEQUANTIZE_INTRA_LOCAL_STEP(7)
                DEQUANTIZE_PRESCALE_STEP(7)
                : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
                : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
                : "memory");
}

/*
 * dequantize_inter_global
 *
 * Dequantizes an inter-coded block with MMX word operations and accumulates
 * a mismatch term across calls.
 *
 *   block    - quantized levels, read as operand %0, 16 bytes per step
 *   cache    - operand %2; cache[63] is adjusted by +/-1 when the accumulated
 *              mismatch crosses the threshold
 *   dqmatrix - multipliers, read as operand %1 (same layout as block)
 *   psmatrix - operand %3; presumably consumed by DEQUANTIZE_PRESCALE_STEP,
 *              which is defined outside this excerpt - TODO confirm
 *   mismatch - in/out accumulator carried between calls
 *
 * Per lane the visible step computes
 *     coeff = sign(level) * (((2*|level| + (level != 0)) * Q) >> 4)
 * so a zero level stays zero.
 *
 * NOTE(review): the code uses 16-bit lane instructions throughout, so dct_t
 * is assumed to be a 16-bit integer type - confirm against its declaration.
 * NOTE(review): neither asm statement lists the MMX registers it modifies as
 * clobbers, and the second statement relies on %mm6 surviving from the first.
 * Left as is because the target toolchain is not visible from here.
 */
static inline void dequantize_inter_global(dct_t *block,
                                           dct_t *cache,
                                           dct_t *dqmatrix,
                                           dct_t *psmatrix,
                                           dct_t *mismatch)
{
  unsigned int m;

  /* One step handles the 16 bytes at offset 0x<x>0 of block/dqmatrix:
   * mm0 carries the first 8 bytes, mm1 the second 8 bytes.
   * Expects mm7 == 0; XORs both results into mm6. */
#define DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(x)                                    \
  "movq 0x" #x "0(%0), %%mm4\n"    /* mm4 = levels, 1st half of the line */        \
  "pxor %%mm2, %%mm2\n"            /* mm2 = 0 */                                   \
  "movq %%mm4, %%mm0\n"            /* mm0 = copy of 1st half */                    \
  "movq 0x" #x "8(%0), %%mm5\n"    /* mm5 = levels, 2nd half of the line */        \
  "pxor %%mm3, %%mm3\n"            /* mm3 = 0 */                                   \
  "movq %%mm5, %%mm1\n"            /* mm1 = copy of 2nd half */                    \
  "psllw $1, %%mm0\n"              /* mm0 = 2*level */                             \
  "pcmpgtw %%mm4, %%mm2\n"         /* mm2 = (level<0)?0xffff:0x0000 */             \
  "psllw $1, %%mm1\n"              /* mm1 = 2*level */                             \
  "pcmpgtw %%mm5, %%mm3\n"         /* mm3 = (level<0)?0xffff:0x0000 */             \
  "pxor %%mm2, %%mm0\n"            /* mm0 = 2*|level|-(level<0) */                 \
  "pxor %%mm3, %%mm1\n"            /* mm1 = 2*|level|-(level<0) */                 \
  "pcmpeqw %%mm7, %%mm4\n"         /* mm4 = (level==0)?0xffff:0x0000 */            \
  "pcmpeqw %%mm7, %%mm5\n"         /* mm5 = (level==0)?0xffff:0x0000 */            \
  "psubsw %%mm2, %%mm0\n"          /* mm0 = 2*|level| */                           \
  "psubsw %%mm3, %%mm1\n"          /* mm1 = 2*|level| */                           \
  "pcmpeqw %%mm7, %%mm4\n"         /* mm4 = (level==0)?0x0000:0xffff */            \
  "pcmpeqw %%mm7, %%mm5\n"         /* mm5 = (level==0)?0x0000:0xffff */            \
  "psubw %%mm4, %%mm0\n"           /* mm0 = 2*|level|+(level!=0) */                \
  "psubw %%mm5, %%mm1\n"           /* mm1 = 2*|level|+(level!=0) */                \
  "pmullw 0x" #x "0(%1), %%mm0\n"  /* mm0 = (2*|level|+1)*Q */                     \
  "pmullw 0x" #x "8(%1), %%mm1\n"  /* mm1 = (2*|level|+1)*Q */                     \
  "psraw $0x04, %%mm0\n"           /* divide by 16 */                              \
  "psraw $0x04, %%mm1\n"           /* divide by 16 */                              \
  "pxor %%mm2, %%mm0\n"            /* mm0 = result*sign(level)-(level<0) */        \
  "pxor %%mm3, %%mm1\n"            /* mm1 = result*sign(level)-(level<0) */        \
  "psubsw %%mm2, %%mm0\n"          /* mm0 = result*sign(level) */                  \
  "psubsw %%mm3, %%mm1\n"          /* mm1 = result*sign(level) */                  \
  "pxor %%mm0, %%mm6\n"            /* accumulate mismatch */                       \
  "pxor %%mm1, %%mm6\n"            /* accumulate mismatch */

  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */
                "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(0)
                DEQUANTIZE_PRESCALE_STEP(0)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(1)
                DEQUANTIZE_PRESCALE_STEP(1)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(2)
                DEQUANTIZE_PRESCALE_STEP(2)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(3)
                DEQUANTIZE_PRESCALE_STEP(3)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(4)
                DEQUANTIZE_PRESCALE_STEP(4)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(5)
                DEQUANTIZE_PRESCALE_STEP(5)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(6)
                DEQUANTIZE_PRESCALE_STEP(6)
                DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(7)
                DEQUANTIZE_GLOBAL_MISMATCH_CONTROL()
                /* WARNING: the mismatch control is too small and would be   */
                /* zeroed by the prescale. This would cause artifacts in the */
                /* long term, since the last coefficient has a high chance   */
                /* of being 0 and thus should be rounded up most of the      */
                /* time. Thus we accumulate the mismatch instead, until it   */
                /* gets large enough to produce significant output after the */
                /* iDCT, resetting the accumulator when the block is coded   */
                /* intra.                                                    */
                DEQUANTIZE_PRESCALE_STEP(7)
                : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
                : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
                : "memory");

  asm volatile("movd %%mm6, %0\n" /* export mismatch */
               : "=r"(m)
               : /* no input */ );

  /* The shifted value is narrowed to dct_t before being accumulated. */
  *mismatch += (dct_t) (m<<12);

  /* threshold is ((1 << 16)/(16*psmatrix[63]) * (1 << 12) + 0.5) = 26887 */
#define MISMATCH_THRESHOLD 26887
  if(*mismatch > MISMATCH_THRESHOLD) {
    /* after this threshold, prescaled mismatch is >= 1 */
    cache[63] ++; /* add mismatch */
    *mismatch -= MISMATCH_THRESHOLD;
  }
  if(*mismatch < (-MISMATCH_THRESHOLD)) {
    cache[63] --; /* sub mismatch */
    *mismatch += MISMATCH_THRESHOLD;
  }
}

/*
 * dequantize_inter_local
 *
 * Dequantizes an inter-coded block with per-coefficient ("local") mismatch
 * control instead of the accumulated scheme used by dequantize_inter_global.
 *
 *   block    - quantized levels, read as operand %0, 16 bytes per step
 *   cache    - operand %2; presumably the destination written by
 *              DEQUANTIZE_PRESCALE_STEP (defined outside this excerpt) -
 *              TODO confirm
 *   dqmatrix - multipliers, read as operand %1 (same layout as block)
 *   psmatrix - operand %3; presumably consumed by DEQUANTIZE_PRESCALE_STEP -
 *              TODO confirm
 *   mismatch - not used by this variant
 *
 * NOTE(review): the asm statement does not list the MMX registers it
 * modifies as clobbers.  Left as is because the target toolchain is not
 * visible from here.
 */
static inline void dequantize_inter_local(dct_t *block,
                                          dct_t *cache,
                                          dct_t *dqmatrix,
                                          dct_t *psmatrix,
                                          dct_t *mismatch /* not used */)
{
  /* coeff[i] = ((2*level[i]+sign(level[i]))*qscale*matrix[i])/16 */
  /* then coeff[i] = { coeff[i] + 1, if coeff[i] < 0 and coeff[i] is even */
  /*                 { coeff[i] - 1, if coeff[i] > 0 and coeff[i] is even */
  /*                 { coeff[i]      otherwise                            */
  /* TODO: check efficiency of new inter_global method on this */

  /* One step handles the 16 bytes at offset 0x<x>0 of block/dqmatrix:
   * mm0 carries lanes [0-3], mm1 carries lanes [4-7].  Expects mm7 == 0. */
#define DEQUANTIZE_INTER_LOCAL_STEP(x)                                             \
  "movq 0x" #x "0(%0), %%mm0\n"    /* load 1st line 1st half */                    \
  "movq 0x" #x "8(%0), %%mm1\n"    /* load 1st line 2nd half */                    \
  "movq %%mm0, %%mm2\n"            /* mm2 = 1st line 1st half */                   \
  "movq %%mm1, %%mm3\n"            /* mm3 = 1st line 2nd half */                   \
  "psraw $0x0f, %%mm2\n"           /* mm2 = (sign(mm0) - 1) / 2 */                 \
  "psraw $0x0f, %%mm3\n"           /* mm3 = (sign(mm1) - 1) / 2 */                 \
  "paddsw %%mm2, %%mm0\n"          /* mm0 = [0-3]+(sign([0-3])-1)/2 */             \
  "paddsw %%mm3, %%mm1\n"          /* mm1 = [4-7]+(sign([4-7])-1)/2 */             \
  "paddsw %%mm0, %%mm0\n"          /* mm0 = 2*[0-3]+sign([0-3])-1 */               \
  "paddsw %%mm1, %%mm1\n"          /* mm1 = 2*[4-7]+sign([4-7])-1 */               \
  "pmullw 0x" #x "0(%1), %%mm0\n"  /* mm0=(2*[0-3]+sign([0-3])-1)*Q */             \
  "pmullw 0x" #x "8(%1), %%mm1\n"  /* mm1=(2*[4-7]+sign([4-7])-1)*Q */             \
  "movq %%mm0, %%mm4\n"            /* mm4 = mm0 (product before adding Q) */       \
  "movq %%mm1, %%mm5\n"            /* mm5 = mm1 (product before adding Q) */       \
  "paddsw 0x" #x "0(%1), %%mm0\n"  /* mm0=(2*[0-3]+sign([0-3]))*Q */               \
  "paddsw 0x" #x "8(%1), %%mm1\n"  /* mm1=(2*[4-7]+sign([4-7]))*Q */               \
  "pcmpeqw %%mm7, %%mm4\n"         /* mm4 lane = 0xFFFF if lane == 0 */            \
  "pcmpeqw %%mm7, %%mm5\n"         /* mm5 lane = 0xFFFF if lane == 0 */            \
  "pcmpeqw %%mm7, %%mm4\n"         /* inverted: 0xFFFF if copy was != 0 */         \
  "pcmpeqw %%mm7, %%mm5\n"         /* inverted: 0xFFFF if copy was != 0 */         \
  "psllw $0x04, %%mm2\n"           /* sign adjust before shift: */                 \
  "psllw $0x04, %%mm3\n"           /*   sign mask becomes -16 or 0 */              \
  "psubw %%mm2, %%mm0\n"           /*   +16 on negative lanes */                   \
  "psubw %%mm3, %%mm1\n"           /*   +16 on negative lanes */                   \
  "psraw $0x04, %%mm2\n"           /*   sign mask back to -1 or 0 */               \
  "psraw $0x04, %%mm3\n"           /*   sign mask back to -1 or 0 */               \
  "paddw %%mm2, %%mm0\n"           /*   net +15 on negative lanes */               \
  "paddw %%mm3, %%mm1\n"           /*   net +15 on negative lanes */               \
  "psraw $0x04, %%mm0\n"           /* divide by 16 */                              \
  "psraw $0x04, %%mm1\n"           /* divide by 16 */                              \
  "pcmpeqw %%mm7, %%mm2\n"         /* invert sign mask */                          \
  "pcmpeqw %%mm7, %%mm3\n"         /* invert sign mask */                          \
  "paddw %%mm2, %%mm0\n"           /* sub 1 if lane was not negative */            \
  "paddw %%mm3, %%mm1\n"           /* sub 1 if lane was not negative */            \
  "por " ASMSYM "_mmx_1, %%mm0\n"  /* or 1 */                                      \
  "por " ASMSYM "_mmx_1, %%mm1\n"  /* or 1 */                                      \
  "pand %%mm4, %%mm0\n"            /* [0-3]=0 if [0-3] was zero */                 \
  "pand %%mm5, %%mm1\n"            /* [4-7]=0 if [4-7] was zero */

  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */
                "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */
                DEQUANTIZE_INTER_LOCAL_STEP(0)
                DEQUANTIZE_PRESCALE_STEP(0)
                DEQUANTIZE_INTER_LOCAL_STEP(1)
                DEQUANTIZE_PRESCALE_STEP(1)
                DEQUANTIZE_INTER_LOCAL_STEP(2)
                DEQUANTIZE_PRESCALE_STEP(2)
                DEQUANTIZE_INTER_LOCAL_STEP(3)
                DEQUANTIZE_PRESCALE_STEP(3)
                DEQUANTIZE_INTER_LOCAL_STEP(4)
                DEQUANTIZE_PRESCALE_STEP(4)
                DEQUANTIZE_INTER_LOCAL_STEP(5)
                DEQUANTIZE_PRESCALE_STEP(5)
                DEQUANTIZE_INTER_LOCAL_STEP(6)
                DEQUANTIZE_PRESCALE_STEP(6)
                DEQUANTIZE_INTER_LOCAL_STEP(7)
                DEQUANTIZE_PRESCALE_STEP(7)
                : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
                : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
                : "memory");
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -