dequantize_mmx.h
/* libfame - Fast Assembly MPEG Encoder Library
 * Copyright (C) 2000-2001 Vivien Chappelier
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/********************** MMX accelerated dequantisation *************************/

#define DEQUANTIZE_PRESCALE_STEP(x) \
    "movq %%mm0, %%mm4\n"             /* mm4 = mm0 */                       \
    "movq %%mm1, %%mm5\n"             /* mm5 = mm1 */                       \
    "pmulhw 0x" #x "0(%3), %%mm0\n"   /* premultiply for iDCT */            \
    "pmulhw 0x" #x "8(%3), %%mm1\n"   /* premultiply for iDCT */            \
    "pmullw 0x" #x "0(%3), %%mm4\n"   /* premultiply for iDCT */            \
    "pmullw 0x" #x "8(%3), %%mm5\n"   /* premultiply for iDCT */            \
    "psrlw $0x0b, %%mm4\n"            /* keep 5 bits */                     \
    "psrlw $0x0b, %%mm5\n"            /* keep 5 bits */                     \
    "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */                             \
    "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 */                             \
    "psrlw $0x01, %%mm4\n"            /* keep 4 bits rounded */             \
    "psrlw $0x01, %%mm5\n"            /* keep 4 bits rounded */             \
    "psllw $0x04, %%mm0\n"            /* multiply by 16 for iDCT */         \
    "psllw $0x04, %%mm1\n"            /* multiply by 16 for iDCT */         \
    "paddsw %%mm4, %%mm0\n"           /* add least significant part */      \
    "paddsw %%mm5, %%mm1\n"           /* add least significant part */      \
    "movq %%mm0, 0x" #x "0(%2)\n"     /* store in cache */                  \
    "movq %%mm1, 0x" #x "8(%2)\n"     /* store in cache */

#define DEQUANTIZE_GLOBAL_MISMATCH_CONTROL() \
    "movq %%mm6, %%mm5\n"    /* copy mismatch */                            \
    "psllq $0x20, %%mm5\n"   /* fold lower 32 bits onto higher 32 bits */   \
    "pxor %%mm6, %%mm5\n"    /* sum mismatch */                             \
    "movq %%mm5, %%mm4\n"    /* copy mismatch */                            \
    "psllq $0x10, %%mm5\n"   /* fold lower 16 bits onto higher 16 bits */   \
    "movq %%mm1, %%mm6\n"    /* copy last line */                           \
    "pxor %%mm5, %%mm4\n"    /* sum mismatch */                             \
    "movq %%mm7, %%mm3\n"    /* mm3 = mm7 */                                \
    "pcmpeqw %%mm7, %%mm3\n" /* mm3 = 0xffffffffffffffff */                 \
    "psllq $0x3f, %%mm3\n"   /* mm3 = 0x8000000000000000 */                 \
    "psrlq $0x0f, %%mm3\n"   /* mm3 = 0x0001000000000000 */                 \
    "pxor %%mm3, %%mm6\n"    /* temp last coeff ^= 1 */                     \
    "pand %%mm3, %%mm4\n"    /* keep only lsb of mismatch */                \
    "pxor %%mm4, %%mm6\n"    /* undo the toggle if the sum was already odd */ \
    "psubsw %%mm1, %%mm6\n"  /* mismatch = temp last coeff - last coeff */  \
    "psrlq $0x30, %%mm6\n"   /* retrieve mismatch in lower word */
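/* For reference, scalar sketches of what the two macros above compute per
 * coefficient. These helpers are illustrative only and not part of libfame.
 * DEQUANTIZE_PRESCALE_STEP splits the 32-bit product coeff * ps into its
 * high (pmulhw) and low (pmullw) 16-bit halves, then recombines them into
 * (coeff * ps) >> 12 with the low part rounded; the saturation of paddsw
 * is ignored here. */
static inline short dequantize_prescale_ref(short coeff, short ps)
{
    int product = (int) coeff * (int) ps;
    int high = product >> 16;         /* pmulhw: high word of the product */
    int low = product & 0xffff;       /* pmullw: low word of the product */
    low = ((low >> 11) + 1) >> 1;     /* keep 4 bits of the low word, rounded */
    return (short) (high * 16 + low); /* recombine, scaled for the iDCT */
}

/* DEQUANTIZE_GLOBAL_MISMATCH_CONTROL is the MPEG-2 mismatch control: if the
 * sum of all 64 dequantised coefficients is even, the LSB of the last
 * coefficient is toggled to make the sum odd. The MMX code tracks the
 * parity by XOR-accumulating every dequantised word in mm6; note that it
 * does not modify the block in place but exports the resulting delta
 * through *mismatch, prescaled by 1 << 12. */
static inline void mismatch_control_ref(short coeff[64])
{
    int i, parity = 0;
    for (i = 0; i < 64; i++)
        parity ^= coeff[i]; /* bit 0 ends up as the parity of the sum */
    if (!(parity & 1))
        coeff[63] ^= 1;     /* make the sum odd */
}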
static void inline dequantize_intra_global(dct_t *block,
                                           dct_t *cache,
                                           dct_t *dqmatrix,
                                           dct_t *psmatrix,
                                           dct_t *mismatch)
{
    unsigned int m;

#define DEQUANTIZE_INTRA_GLOBAL_STEP(x) \
    "movq 0x" #x "0(%0), %%mm0\n"   /* load 1st line 1st half */            \
    "movq 0x" #x "8(%0), %%mm1\n"   /* load 1st line 2nd half */            \
    "movq %%mm0, %%mm2\n"           /* mm2 = 1st line 1st half */           \
    "movq %%mm1, %%mm3\n"           /* mm3 = 1st line 2nd half */           \
    "psraw $0x0f, %%mm2\n"          /* mm2 = (sign(mm0) - 1) / 2 */         \
    "psraw $0x0f, %%mm3\n"          /* mm3 = (sign(mm1) - 1) / 2 */         \
    "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0 = [0-3]*Q */                     \
    "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1 = [4-7]*Q */                     \
    "psllw $0x03, %%mm2\n"          /* sign adjust before shift */          \
    "psllw $0x03, %%mm3\n"          /* sign adjust before shift */          \
    "psubw %%mm2, %%mm0\n"          /* sign adjust before shift */          \
    "psubw %%mm3, %%mm1\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm2\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm3\n"          /* sign adjust before shift */          \
    "paddw %%mm2, %%mm0\n"          /* sign adjust before shift */          \
    "paddw %%mm3, %%mm1\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm0\n"          /* divide by 8 */                       \
    "psraw $0x03, %%mm1\n"          /* divide by 8 */                       \
    "pxor %%mm0, %%mm6\n"           /* accumulate mismatch */               \
    "pxor %%mm1, %%mm6\n"           /* accumulate mismatch */

    asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */
                  "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */
                  DEQUANTIZE_INTRA_GLOBAL_STEP(0)
                  DEQUANTIZE_PRESCALE_STEP(0)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(1)
                  DEQUANTIZE_PRESCALE_STEP(1)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(2)
                  DEQUANTIZE_PRESCALE_STEP(2)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(3)
                  DEQUANTIZE_PRESCALE_STEP(3)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(4)
                  DEQUANTIZE_PRESCALE_STEP(4)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(5)
                  DEQUANTIZE_PRESCALE_STEP(5)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(6)
                  DEQUANTIZE_PRESCALE_STEP(6)
                  DEQUANTIZE_INTRA_GLOBAL_STEP(7)
                  DEQUANTIZE_GLOBAL_MISMATCH_CONTROL()
                  DEQUANTIZE_PRESCALE_STEP(7)
                  : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
                  : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
                  : "memory");

    asm volatile ("movd %%mm6, %0\n" /* export mismatch */
                  : "=r"(m)
                  : /* no input */);

    *mismatch = (dct_t) (m << 12);
}
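/* A scalar sketch of what one DEQUANTIZE_INTRA_GLOBAL_STEP lane computes;
 * the helper name is illustrative and not part of libfame. The macro
 * multiplies each level by its quantiser matrix entry and divides by 8,
 * truncating toward zero: the psraw/psllw sequence adds 7 before the
 * arithmetic shift when the level is negative. Assumes two's-complement
 * arithmetic right shifts, as the MMX code itself does. */
static inline short dequantize_intra_global_ref(short level, short q)
{
    short x = (short) (level * q); /* pmullw keeps the low 16 bits */
    short sign = level >> 15;      /* psraw $0x0f: 0 if level >= 0, else -1 */
    return (short) ((x + (sign & 7)) >> 3); /* x/8, rounded toward zero */
}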
static void inline dequantize_intra_local(dct_t *block,
                                          dct_t *cache,
                                          dct_t *dqmatrix,
                                          dct_t *psmatrix,
                                          dct_t *mismatch /* not used */)
{
    /* coeff[i] = (2*level[i]*qscale*matrix[i])/16 */
    /* then coeff[i] = { coeff[i] + 1, if coeff[i] < 0 and coeff[i] is even */
    /*                 { coeff[i] - 1, if coeff[i] > 0 and coeff[i] is even */
    /*                 { coeff[i],     otherwise */
    /* implementation is */
    /* coeff[i] = (level[i]*qscale*matrix[i] + ((level[i] < 0) ? 7 : 0)) >> 3 */
    /* coeff[i] = (coeff[i] - ((coeff[i] > 0) ? 1 : 0)) | 1 */

#define DEQUANTIZE_INTRA_LOCAL_STEP(x) \
    "movq 0x" #x "0(%0), %%mm0\n"   /* load 1st line 1st half */            \
    "movq 0x" #x "8(%0), %%mm1\n"   /* load 1st line 2nd half */            \
    "movq %%mm0, %%mm2\n"           /* mm2 = 1st line 1st half */           \
    "movq %%mm1, %%mm3\n"           /* mm3 = 1st line 2nd half */           \
    "psraw $0x0f, %%mm2\n"          /* mm2 = (sign(mm0) - 1) / 2 */         \
    "psraw $0x0f, %%mm3\n"          /* mm3 = (sign(mm1) - 1) / 2 */         \
    "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0 = [0-3]*Q */                     \
    "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1 = [4-7]*Q */                     \
    "movq %%mm0, %%mm4\n"           /* mm4 = mm0 */                         \
    "movq %%mm1, %%mm5\n"           /* mm5 = mm1 */                         \
    "pcmpeqw %%mm7, %%mm4\n"        /* mm4 words = 0xffff where mm0 == 0 */ \
    "pcmpeqw %%mm7, %%mm5\n"        /* mm5 words = 0xffff where mm1 == 0 */ \
    "pcmpeqw %%mm7, %%mm4\n"        /* mm4 words = 0xffff where mm0 != 0 */ \
    "pcmpeqw %%mm7, %%mm5\n"        /* mm5 words = 0xffff where mm1 != 0 */ \
    "psllw $0x03, %%mm2\n"          /* sign adjust before shift */          \
    "psllw $0x03, %%mm3\n"          /* sign adjust before shift */          \
    "psubw %%mm2, %%mm0\n"          /* sign adjust before shift */          \
    "psubw %%mm3, %%mm1\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm2\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm3\n"          /* sign adjust before shift */          \
    "paddw %%mm2, %%mm0\n"          /* sign adjust before shift */          \
    "paddw %%mm3, %%mm1\n"          /* sign adjust before shift */          \
    "psraw $0x03, %%mm0\n"          /* divide by 8 */                       \
    "psraw $0x03, %%mm1\n"          /* divide by 8 */                       \
    "pcmpeqw %%mm7, %%mm2\n"        /* invert sign mask: 0xffff where >= 0 */ \
    "pcmpeqw %%mm7, %%mm3\n"        /* invert sign mask: 0xffff where >= 0 */ \
    "paddw %%mm2, %%mm0\n"          /* sub 1 if >= 0 (zeros fixed by mask) */ \
    "paddw %%mm3, %%mm1\n"          /* sub 1 if >= 0 (zeros fixed by mask) */ \
    "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */                              \
    "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */                              \
    "pand %%mm4, %%mm0\n"           /* [0-3] = 0 if [0-3] was zero */       \
    "pand %%mm5, %%mm1\n"           /* [4-7] = 0 if [4-7] was zero */

    asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */
                  "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */
                  DEQUANTIZE_INTRA_LOCAL_STEP(0)
                  DEQUANTIZE_PRESCALE_STEP(0)
                  DEQUANTIZE_INTRA_LOCAL_STEP(1)
                  DEQUANTIZE_PRESCALE_STEP(1)
                  DEQUANTIZE_INTRA_LOCAL_STEP(2)
                  DEQUANTIZE_PRESCALE_STEP(2)
                  DEQUANTIZE_INTRA_LOCAL_STEP(3)
                  DEQUANTIZE_PRESCALE_STEP(3)
                  /* the source listing breaks off here; steps 4-7 and the
                     operand lists below are reconstructed on the assumption
                     that they mirror dequantize_intra_global, minus its
                     global mismatch control */
                  DEQUANTIZE_INTRA_LOCAL_STEP(4)
                  DEQUANTIZE_PRESCALE_STEP(4)
                  DEQUANTIZE_INTRA_LOCAL_STEP(5)
                  DEQUANTIZE_PRESCALE_STEP(5)
                  DEQUANTIZE_INTRA_LOCAL_STEP(6)
                  DEQUANTIZE_PRESCALE_STEP(6)
                  DEQUANTIZE_INTRA_LOCAL_STEP(7)
                  DEQUANTIZE_PRESCALE_STEP(7)
                  : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
                  : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
                  : "memory");
}
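/* A scalar sketch of the "oddification" DEQUANTIZE_INTRA_LOCAL_STEP applies
 * after the divide (the local form of mismatch control); the helper name is
 * illustrative and not part of libfame. Nonzero even coefficients move one
 * step toward zero so they become odd; coefficients that were zero stay
 * zero, which is what the pand with the double-pcmpeqw mask enforces. */
static inline short oddify_ref(short coeff)
{
    if (coeff == 0)
        return 0;                         /* zero lanes are masked out */
    if (coeff > 0)
        return (short) ((coeff - 1) | 1); /* e.g. 4 -> 3, 3 -> 3 */
    return (short) (coeff | 1);           /* e.g. -4 -> -3, -3 -> -3 */
}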