; quantize_mpeg_xmm.asm -- MPEG-style (de)quantization, x86-64 MMX/XMM port.
; NOTE(review): the original first lines ("📄 quantize_mpeg_xmm.asm" / "字号:",
; i.e. "font size:") were web-code-viewer chrome captured when this file was
; scraped, not part of the assembly source; they are recorded here as comments
; so the file remains assemblable.
mov rsi, -14 mov rbx, rsp sub rsp, byte 24 ; 16 would be enough, but it isn't important lea rbx, [rsp+8] and rbx, byte -8 ;ALIGN 8 pxor mm0, mm0 pxor mm3, mm3 movq [byte rbx],mm0 movq [rbx+8],mm0 cmp rcx, byte 1 je near .q1loop cmp rcx, byte 19 jg near .lloopALIGN 16.loop movq mm1, [rax + 8*rsi+112] ; mm0 = [1st] psubw mm0, mm1 ;-mm1 movq mm4, [rax + 8*rsi + 120] ; psubw mm3, mm4 ;-mm4 pmaxsw mm0, mm1 ;|src| pmaxsw mm3, mm4; nop2 psraw mm1, 15 ;sign src psraw mm4, 15 psllw mm0, 4 ; level << 4 psllw mm3, 4 ; paddw mm0, [rdi + 640 + 8*rsi+112] paddw mm3, [rdi + 640 + 8*rsi+120] movq mm5, [rdi + 896 + 8*rsi+112] movq mm7, [rdi + 896 + 8*rsi+120] pmulhuw mm5, mm0 pmulhuw mm7, mm3; mov esp, esp movq mm2, [rdi + 512 + 8*rsi+112] movq mm6, [rdi + 512 + 8*rsi+120] pmullw mm2, mm5 pmullw mm6, mm7 psubw mm0, mm2 psubw mm3, mm6 movq mm2, [byte rbx] lea r11, [mmx_divs wrt rip] movq mm6, [r11 + rcx * 8 - 8] pmulhuw mm0, [rdi + 768 + 8*rsi+112] pmulhuw mm3, [rdi + 768 + 8*rsi+120] paddw mm2, [rbx+8] ;sum paddw mm5, mm0 paddw mm7, mm3 pxor mm0, mm0 pxor mm3, mm3 pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32) add rsi, byte 2 paddw mm2, mm5 ;sum += x1 movq [rbx], mm7 ;store x2 pxor mm5, mm1 ; mm0 *= sign(mm0) pxor mm7, mm4 ; psubw mm5, mm1 ; undisplace psubw mm7, mm4 ;; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum movq [rbx+8], mm2 ;store sum movq [rdx + 8*rsi+112-16], mm5 movq [rdx + 8*rsi +120-16], mm7 jng near .loop.done; calculate data[0] // (int32_t)dcscalar) paddw mm2, [rbx] add rsp, byte 24 pop rbx pmaddwd mm2, [mmx_one wrt rip] punpckldq mm0, mm2 ;get low dw to mm0:high paddd mm0,mm2 punpckhdq mm0, mm0 ;get result to low movd rax, mm0 retALIGN 16.q1loop movq mm1, [rax + 8*rsi+112] ; mm0 = [1st] psubw mm0, mm1 ;-mm1 movq mm4, [rax + 8*rsi+120] psubw mm3, mm4 ;-mm4 pmaxsw mm0, mm1 ;|src| pmaxsw mm3, mm4; nop2 psraw mm1, 15 ; sign src psraw mm4, 15 psllw mm0, 4 ; level << 4 psllw mm3, 4 paddw mm0, [rdi + 640 + 8*rsi+112] 
;mm0 is to be divided paddw mm3, [rdi + 640 + 8*rsi+120] ; inter1 contains fix for division by 1 movq mm5, [rdi + 896 + 8*rsi+112] ;with rounding down movq mm7, [rdi + 896 + 8*rsi+120] pmulhuw mm5, mm0 pmulhuw mm7, mm3 ;mm7: first approx of division; mov esp, esp movq mm2, [rdi + 512 + 8*rsi+112] movq mm6, [rdi + 512 + 8*rsi+120] ; divs for q<=16 pmullw mm2, mm5 ;test value <= original pmullw mm6, mm7 psubw mm0, mm2 ;mismatch psubw mm3, mm6 movq mm2, [byte rbx] pmulhuw mm0, [rdi + 768 + 8*rsi+112] ;correction pmulhuw mm3, [rdi + 768 + 8*rsi+120] paddw mm2, [rbx+8] ;sum paddw mm5, mm0 ;final result paddw mm7, mm3 pxor mm0, mm0 pxor mm3, mm3 psrlw mm5, 1 ; (level ) /2 (quant = 1) psrlw mm7, 1 add rsi, byte 2 paddw mm2, mm5 ;sum += x1 movq [rbx], mm7 ;store x2 pxor mm5, mm1 ; mm0 *= sign(mm0) pxor mm7, mm4 ; psubw mm5, mm1 ; undisplace psubw mm7, mm4 ; movq [rbx+8], mm2 ;store sum movq [rdx + 8*rsi+112-16], mm5 movq [rdx + 8*rsi +120-16], mm7 jng near .q1loop jmp near .doneALIGN 8.lloop movq mm1, [rax + 8*rsi+112] ; mm0 = [1st] psubw mm0,mm1 ;-mm1 movq mm4, [rax + 8*rsi+120] psubw mm3,mm4 ;-mm4 pmaxsw mm0,mm1 ;|src| pmaxsw mm3,mm4; nop2 psraw mm1,15 ;sign src psraw mm4,15 psllw mm0, 4 ; level << 4 psllw mm3, 4 ; paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided inter1 contains fix for division by 1 paddw mm3, [rdi + 640 + 8*rsi+120] movq mm5,[rdi + 896 + 8*rsi+112] movq mm7,[rdi + 896 + 8*rsi+120] pmulhuw mm5,mm0 pmulhuw mm7,mm3 ;mm7: first approx of division; mov esp,esp movq mm2,[rdi + 512 + 8*rsi+112] movq mm6,[rdi + 512 + 8*rsi+120] pmullw mm2,mm5 ;test value <= original pmullw mm6,mm7 psubw mm0,mm2 ;mismatch psubw mm3,mm6 movq mm2,[byte rbx] lea r11, [mmx_div wrt rip] movq mm6,[r11 + rcx * 8 - 8] ; divs for q<=16 pmulhuw mm0,[rdi + 768 + 8*rsi+112] ;correction pmulhuw mm3,[rdi + 768 + 8*rsi+120] paddw mm2,[rbx+8] ;sum paddw mm5,mm0 ;final result paddw mm7,mm3 pxor mm0,mm0 pxor mm3,mm3 pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 pmulhuw mm7, mm6 ; (level ) / 
quant (0<quant<32) add rsi,byte 2 psrlw mm5, 1 ; (level ) / (2*quant) paddw mm2,mm5 ;sum += x1 psrlw mm7, 1 movq [rbx],mm7 ;store x2 pxor mm5, mm1 ; mm0 *= sign(mm0) pxor mm7, mm4 ; psubw mm5, mm1 ; undisplace psubw mm7, mm4 ;; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum movq [rbx+8], mm2 ;store sum movq [rdx + 8*rsi+112-16], mm5 movq [rdx + 8*rsi +120-16], mm7 jng near .lloop jmp near .done.endfunc;-----------------------------------------------------------------------------;; uint32_t dequant_mpeg_intra_x86_64(int16_t *data,; const int16_t const *coeff,; const uint32_t quant,; const uint32_t dcscalar,; const uint16_t *mpeg_matrices);; Ported from the 32bit 3dne cousin;----------------------------------------------------------------------------- ; Note: in order to saturate 'easily', we pre-shift the quantifier ; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) are used to ; build a saturating mask. It is non-zero only when an overflow occured. ; We thus avoid packing/unpacking toward double-word. ; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g., ; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not ; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],a ; and quant in [1..31]. 
;%macro DEQUANT4INTRAMMX 1 movq mm1, [byte rcx+ 16 * %1] ; mm0 = c = coeff[i] movq mm4, [rcx+ 16 * %1 +8] ; mm3 = c' = coeff[i+1] psubw mm0, mm1 psubw mm3, mm4 pmaxsw mm0, mm1 pmaxsw mm3, mm4 psraw mm1, 15 psraw mm4, 15%if %1 movq mm2, [rsp-16] movq mm7, [rsp-16]%endif pmullw mm2, [rdi + 16 * %1 ] ; matrix[i]*quant pmullw mm7, [rdi + 16 * %1 +8] ; matrix[i+1]*quant movq mm5, mm0 movq mm6, mm3 pmulhw mm0, mm2 ; high of coeff*(matrix*quant) pmulhw mm3, mm7 ; high of coeff*(matrix*quant) pmullw mm2, mm5 ; low of coeff*(matrix*quant) pmullw mm7, mm6 ; low of coeff*(matrix*quant) pcmpgtw mm0, [rsp-8] pcmpgtw mm3, [rsp-8] paddusw mm2, mm0 paddusw mm7, mm3 psrlw mm2, 5 psrlw mm7, 5 pxor mm2, mm1 ; start negating back pxor mm7, mm4 ; start negating back psubusw mm1, mm0 psubusw mm4, mm3 movq mm0, [rsp-8] movq mm3, [rsp-8] psubw mm2, mm1 ; finish negating back psubw mm7, mm4 ; finish negating back movq [byte rdx + 16 * %1], mm2 ; data[i] movq [rdx + 16 * %1 +8], mm7 ; data[i+1]%endmacroALIGN 16dequant_mpeg_intra_x86_64: mov rax, rdx ; quant mov rdx, rdi ; data mov r9, rcx ; dcscalar mov rcx, rsi ; coeff lea r11, [mmx_mul_quant wrt rip] movq mm7, [r11 + rax*8 - 8] psllw mm7, 2 ; << 2. See comment. 
push rbx movsx ebx, word [rcx] pxor mm0, mm0 pxor mm3, mm3 movq [rsp-8], mm0 movq [rsp-16], mm7 imul ebx, r9d movq mm2, mm7 mov rdi, r8 ; mpeg_quant_matricesALIGN 4 DEQUANT4INTRAMMX 0 mov esi, -2048; nop cmp ebx, esi DEQUANT4INTRAMMX 1 cmovl ebx, esi neg esi sub esi, byte 1 ;2047 DEQUANT4INTRAMMX 2 cmp ebx, esi cmovg ebx, esi DEQUANT4INTRAMMX 3 mov [byte rdx], bx DEQUANT4INTRAMMX 4 DEQUANT4INTRAMMX 5 DEQUANT4INTRAMMX 6 DEQUANT4INTRAMMX 7 pop rbx xor rax, rax ret.endfunc;-----------------------------------------------------------------------------;; uint32_t dequant_mpeg_inter_3dne(int16_t * data,; const int16_t * const coeff,; const uint32_t quant,; const uint16_t *mpeg_matrices);; Ported from 32bit 3dne cousin;----------------------------------------------------------------------------- ; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier ; so we handle the 3 cases: c<0, c==0, and c>0 in one shot. ; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0. ; It's mixed with the extraction of the absolute value.ALIGN 16dequant_mpeg_inter_x86_64: mov rax, rdx ; quant mov rdx, rdi ; data mov rdi, rcx ; mpeg_matrices mov rcx, rsi ; coeff lea r11, [mmx_mul_quant wrt rip] movq mm7, [r11 + rax*8 - 8] mov rax, -14 paddw mm7, mm7 ; << 1 pxor mm6, mm6 ; mismatch sum pxor mm1, mm1 pxor mm3, mm3ALIGN 16.loop movq mm0, [rcx+8*rax + 7*16 ] ; mm0 = coeff[i] pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved) movq mm2, [rcx+8*rax + 7*16 +8] ; mm2 = coeff[i+1] pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved) paddsw mm0, mm1 ; c += sgn(c) paddsw mm2, mm3 ; c += sgn(c') paddw mm0, mm0 ; c *= 2 paddw mm2, mm2 ; c'*= 2 movq mm4, [mmzero wrt rip] movq mm5, [mmzero wrt rip] psubw mm4, mm0 ; -c psubw mm5, mm2 ; -c' psraw mm4, 16 ; mm4 = sgn(-c) psraw mm5, 16 ; mm5 = sgn(-c') psubsw mm0, mm4 ; c -= sgn(-c) psubsw mm2, mm5 ; c' -= sgn(-c') pxor mm0, mm1 ; finish changing sign if needed pxor mm2, mm3 ; finish changing sign if needed ; we're short on register, here. Poor pairing... 
movq mm4, mm7 ; (matrix*quant); nop pmullw mm4, [rdi + 512 + 8*rax + 7*16] movq mm5, mm4 pmulhw mm5, mm0 ; high of c*(matrix*quant) pmullw mm0, mm4 ; low of c*(matrix*quant) movq mm4, mm7 ; (matrix*quant) pmullw mm4, [rdi + 512 + 8*rax + 7*16 + 8] add rax, byte 2 pcmpgtw mm5, [mmzero wrt rip] paddusw mm0, mm5 psrlw mm0, 5 pxor mm0, mm1 ; start restoring sign psubusw mm1, mm5 movq mm5, mm4 pmulhw mm5, mm2 ; high of c*(matrix*quant) pmullw mm2, mm4 ; low of c*(matrix*quant) psubw mm0, mm1 ; finish restoring sign pcmpgtw mm5, [mmzero wrt rip] paddusw mm2, mm5 psrlw mm2, 5 pxor mm2, mm3 ; start restoring sign psubusw mm3, mm5 psubw mm2, mm3 ; finish restoring sign movq mm1, [mmzero wrt rip] movq mm3, [byte mmzero wrt rip] pxor mm6, mm0 ; mismatch control movq [rdx + 8*rax + 7*16 -2*8 ], mm0 ; data[i] pxor mm6, mm2 ; mismatch control movq [rdx + 8*rax + 7*16 -2*8 +8], mm2 ; data[i+1] jng .loop; nop ; mismatch control pshufw mm0, mm6, 01010101b pshufw mm1, mm6, 10101010b pshufw mm2, mm6, 11111111b pxor mm6, mm0 pxor mm1, mm2 pxor mm6, mm1 movd rax, mm6 and rax, byte 1 xor rax, byte 1 xor word [rdx + 2*63], ax xor rax, rax ret.endfunc
; ---------------------------------------------------------------------------
; NOTE(review): the lines that followed here were keyboard-shortcut help text
; from the web code viewer this file was scraped from, not assembly source.
; Translated record (commented out so the file assembles):
;   Copy code: Ctrl+C        Search code: Ctrl+F      Fullscreen: F11
;   Switch theme: Ctrl+Shift+D   Show shortcuts: ?
;   Increase font size: Ctrl+=   Decrease font size: Ctrl+-
; ---------------------------------------------------------------------------