⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quantize_mpeg_xmm.asm

📁 这是一个压缩解压包,用C语言进行编程的,里面有详细的源代码.
💻 ASM
📖 第 1 页 / 共 2 页
字号:
  mov rsi, -14  mov rbx, rsp  sub rsp, byte 24	; 16 would be enough, but it isn't important  lea rbx, [rsp+8]  and rbx, byte -8 ;ALIGN 8  pxor mm0, mm0  pxor mm3, mm3  movq [byte rbx],mm0  movq [rbx+8],mm0  cmp rcx, byte 1  je near .q1loop  cmp rcx, byte 19  jg near .lloopALIGN 16.loop  movq mm1, [rax + 8*rsi+112]       ; mm0 = [1st]  psubw mm0, mm1 ;-mm1  movq mm4, [rax + 8*rsi + 120] ;  psubw mm3, mm4 ;-mm4  pmaxsw mm0, mm1 ;|src|  pmaxsw mm3, mm4;  nop2  psraw mm1, 15         ;sign src  psraw mm4, 15  psllw mm0, 4          ; level << 4  psllw mm3, 4          ;  paddw mm0, [rdi + 640 + 8*rsi+112]  paddw mm3, [rdi + 640 + 8*rsi+120]  movq mm5, [rdi + 896 + 8*rsi+112]  movq mm7, [rdi + 896 + 8*rsi+120]  pmulhuw mm5, mm0  pmulhuw mm7, mm3;  mov esp, esp  movq mm2, [rdi + 512 + 8*rsi+112]  movq mm6, [rdi + 512 + 8*rsi+120]  pmullw mm2, mm5  pmullw mm6, mm7  psubw mm0, mm2  psubw mm3, mm6  movq mm2, [byte rbx]  lea r11, [mmx_divs wrt rip]  movq mm6, [r11 + rcx * 8 - 8]  pmulhuw mm0, [rdi + 768 + 8*rsi+112]  pmulhuw mm3, [rdi + 768 + 8*rsi+120]  paddw mm2, [rbx+8]    ;sum  paddw mm5, mm0  paddw mm7, mm3  pxor mm0, mm0  pxor mm3, mm3  pmulhuw mm5, mm6      ; mm0 = (mm0 / 2Q) >> 16  pmulhuw mm7, mm6      ;  (level ) / quant (0<quant<32)  add rsi, byte 2  paddw mm2, mm5        ;sum += x1  movq [rbx], mm7       ;store x2  pxor mm5, mm1         ; mm0 *= sign(mm0)  pxor mm7, mm4         ;  psubw mm5, mm1        ; undisplace  psubw mm7, mm4        ;;  db 0Fh, 7Fh, 54h, 23h, 08 ;movq   [ebx+8],mm2 ;store sum  movq [rbx+8], mm2 ;store sum  movq [rdx + 8*rsi+112-16], mm5  movq [rdx + 8*rsi +120-16], mm7  jng near .loop.done; calculate  data[0] // (int32_t)dcscalar)  paddw mm2, [rbx]  add rsp, byte 24  pop rbx  pmaddwd mm2, [mmx_one wrt rip]  punpckldq mm0, mm2 ;get low dw to mm0:high  paddd mm0,mm2  punpckhdq mm0, mm0 ;get result to low  movd rax, mm0  retALIGN 16.q1loop  movq mm1, [rax + 8*rsi+112]       ; mm0 = [1st]  psubw mm0, mm1                    ;-mm1  movq mm4, [rax + 
8*rsi+120]  psubw mm3, mm4                    ;-mm4  pmaxsw mm0, mm1                   ;|src|  pmaxsw mm3, mm4;  nop2  psraw mm1, 15                             ; sign src  psraw mm4, 15  psllw mm0, 4                              ; level << 4  psllw mm3, 4  paddw mm0, [rdi + 640 + 8*rsi+112]    ;mm0 is to be divided  paddw mm3, [rdi + 640 + 8*rsi+120]    ; inter1 contains fix for division by 1  movq mm5, [rdi + 896 + 8*rsi+112] ;with rounding down  movq mm7, [rdi + 896 + 8*rsi+120]  pmulhuw mm5, mm0  pmulhuw mm7, mm3                          ;mm7: first approx of division;  mov esp, esp  movq mm2, [rdi + 512 + 8*rsi+112]  movq mm6, [rdi + 512 + 8*rsi+120]      ; divs for q<=16  pmullw mm2, mm5                           ;test value <= original  pmullw mm6, mm7  psubw mm0, mm2                            ;mismatch  psubw mm3, mm6  movq mm2, [byte rbx]  pmulhuw mm0, [rdi + 768 + 8*rsi+112]  ;correction  pmulhuw mm3, [rdi + 768 + 8*rsi+120]  paddw mm2, [rbx+8]    ;sum  paddw mm5, mm0        ;final result  paddw mm7, mm3  pxor mm0, mm0  pxor mm3, mm3  psrlw mm5, 1          ;  (level ) /2  (quant = 1)  psrlw mm7, 1  add rsi, byte 2  paddw mm2, mm5        ;sum += x1  movq [rbx], mm7       ;store x2  pxor mm5, mm1         ; mm0 *= sign(mm0)  pxor mm7, mm4         ;  psubw mm5, mm1        ; undisplace  psubw mm7, mm4        ;  movq [rbx+8], mm2     ;store sum  movq [rdx + 8*rsi+112-16], mm5  movq [rdx + 8*rsi +120-16], mm7  jng near .q1loop  jmp near .doneALIGN 8.lloop  movq mm1, [rax + 8*rsi+112]       ; mm0 = [1st]  psubw mm0,mm1         ;-mm1  movq mm4, [rax + 8*rsi+120]  psubw mm3,mm4         ;-mm4  pmaxsw mm0,mm1        ;|src|  pmaxsw mm3,mm4;  nop2  psraw mm1,15          ;sign src  psraw mm4,15  psllw mm0, 4          ; level << 4  psllw mm3, 4          ;  paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided inter1 contains fix for division by 1  paddw mm3, [rdi + 640 + 8*rsi+120]  movq mm5,[rdi + 896 + 8*rsi+112]  movq mm7,[rdi + 896 + 8*rsi+120]  pmulhuw mm5,mm0  
pmulhuw mm7,mm3       ;mm7: first approx of division;  mov esp,esp  movq mm2,[rdi + 512 + 8*rsi+112]  movq mm6,[rdi + 512 + 8*rsi+120]  pmullw mm2,mm5        ;test value <= original  pmullw mm6,mm7  psubw mm0,mm2         ;mismatch  psubw mm3,mm6  movq mm2,[byte rbx]  lea r11, [mmx_div wrt rip]  movq mm6,[r11 + rcx * 8 - 8]  ; divs for q<=16  pmulhuw mm0,[rdi + 768 + 8*rsi+112] ;correction  pmulhuw mm3,[rdi + 768 + 8*rsi+120]  paddw mm2,[rbx+8]     ;sum  paddw mm5,mm0         ;final result  paddw mm7,mm3  pxor mm0,mm0  pxor mm3,mm3  pmulhuw mm5, mm6      ; mm0 = (mm0 / 2Q) >> 16  pmulhuw mm7, mm6      ;  (level ) / quant (0<quant<32)  add rsi,byte 2  psrlw mm5, 1          ; (level ) / (2*quant)  paddw mm2,mm5         ;sum += x1  psrlw mm7, 1  movq [rbx],mm7        ;store x2  pxor mm5, mm1         ; mm0 *= sign(mm0)  pxor mm7, mm4         ;  psubw mm5, mm1        ; undisplace  psubw mm7, mm4        ;;  db 0Fh, 7Fh, 54h, 23h, 08 ;movq   [ebx+8],mm2 ;store sum  movq [rbx+8], mm2 ;store sum  movq [rdx + 8*rsi+112-16], mm5  movq [rdx + 8*rsi +120-16], mm7  jng near .lloop  jmp near .done.endfunc;-----------------------------------------------------------------------------;; uint32_t dequant_mpeg_intra_x86_64(int16_t *data,;                                  const int16_t const *coeff,;                                  const uint32_t quant,;                                  const uint32_t dcscalar,;                                  const uint16_t *mpeg_matrices);; Ported from the 32bit 3dne cousin;-----------------------------------------------------------------------------  ;   Note: in order to saturate 'easily', we pre-shift the quantifier  ; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) are used to  ; build a saturating mask. It is non-zero only when an overflow occured.  ; We thus avoid packing/unpacking toward double-word.  ; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,  ; (coeff[i]*matrix[i]). 
; This is less prone to overflow if coeff[] are not
; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
; and quant in [1..31].
;

;-----------------------------------------------------------------------------
; DEQUANT4INTRAMMX %1 -- dequantize 8 intra coefficients (coeff[8*%1 .. 8*%1+7]).
; Expects: rcx = coeff, rdx = data, rdi = mpeg matrix,
;          mm2/mm7 = quant multiplier (reloaded from [rsp-16] when %1 > 0,
;          since both are clobbered below),
;          [rsp-8] = packed-word zero, mm0/mm3 = 0 on entry
;          (they are re-zeroed at the end of each expansion).
; Scheme:  work on |c| (pmaxsw of c and -c) while keeping sgn(c) in mm1/mm4,
;          multiply by matrix*quant, use the high word of the product as a
;          saturation mask (non-zero only on overflow -- see note above),
;          shift down by 5 and re-apply the sign (pxor + psubw).
;-----------------------------------------------------------------------------
%macro DEQUANT4INTRAMMX 1
  movq mm1, [byte rcx+ 16 * %1] ; mm1 = c  = coeff[i]
  movq mm4, [rcx+ 16 * %1 +8]   ; mm4 = c' = coeff[i+1]
  psubw mm0, mm1                ; mm0 = -c   (mm0 was 0)
  psubw mm3, mm4                ; mm3 = -c'
  pmaxsw mm0, mm1               ; mm0 = |c|
  pmaxsw mm3, mm4               ; mm3 = |c'|
  psraw mm1, 15                 ; mm1 = sgn(c)  (0 / -1 per word)
  psraw mm4, 15                 ; mm4 = sgn(c')
%if %1
  movq mm2, [rsp-16]            ; reload quant multiplier for this expansion
  movq mm7, [rsp-16]
%endif
  pmullw mm2, [rdi + 16 * %1 ]     ; matrix[i]*quant
  pmullw mm7, [rdi + 16 * %1 +8]   ; matrix[i+1]*quant
  movq mm5, mm0
  movq mm6, mm3
  pmulhw mm0, mm2   ; high of coeff*(matrix*quant)
  pmulhw mm3, mm7   ; high of coeff*(matrix*quant)
  pmullw mm2, mm5   ; low  of coeff*(matrix*quant)
  pmullw mm7, mm6   ; low  of coeff*(matrix*quant)
  pcmpgtw mm0, [rsp-8]          ; saturation mask: -1 where high word > 0
  pcmpgtw mm3, [rsp-8]
  paddusw mm2, mm0              ; saturate low word to 0xffff on overflow
  paddusw mm7, mm3
  psrlw mm2, 5
  psrlw mm7, 5
  pxor mm2, mm1     ; start negating back
  pxor mm7, mm4     ; start negating back
  psubusw mm1, mm0              ; cancel the sign fixup on saturated words
  psubusw mm4, mm3
  movq mm0, [rsp-8]             ; mm0/mm3 = 0 again for the next expansion
  movq mm3, [rsp-8]
  psubw mm2, mm1    ; finish negating back
  psubw mm7, mm4    ; finish negating back
  movq [byte rdx + 16 * %1], mm2   ; data[i]
  movq [rdx + 16 * %1  +8], mm7   ; data[i+1]
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
; uint32_t dequant_mpeg_intra_x86_64(int16_t *data, const int16_t *coeff,
;                                    uint32_t quant, uint32_t dcscalar,
;                                    const uint16_t *mpeg_matrices)
; SysV AMD64 entry: rdi=data, rsi=coeff, rdx=quant, rcx=dcscalar,
; r8=mpeg_matrices.  Registers are remapped below so that
; DEQUANT4INTRAMMX sees rdx=data, rcx=coeff, rdi=matrices.
; The DC coefficient is handled separately (coeff[0]*dcscalar, clamped to
; [-2048, 2047]); its scalar computation is interleaved with the macro
; expansions to fill pipeline slots.  Two scratch qwords live below rsp
; ([rsp-8] = zero, [rsp-16] = quant multiplier) in the SysV red zone.
; Returns 0 in rax.  NOTE(review): caller is expected to emms -- confirm.
;-----------------------------------------------------------------------------
dequant_mpeg_intra_x86_64:
  mov rax, rdx		; quant
  mov rdx, rdi		; data
  mov r9, rcx		; dcscalar
  mov rcx, rsi		; coeff
  lea r11, [mmx_mul_quant wrt rip]
  movq mm7, [r11  + rax*8 - 8]  ; mm7 = multiplier for this quant
  psllw mm7, 2      ; << 2. See comment.
  push rbx
  movsx ebx, word [rcx]         ; ebx = coeff[0]
  pxor mm0, mm0
  pxor mm3, mm3
  movq [rsp-8], mm0             ; scratch: packed-word zero
  movq [rsp-16], mm7            ; scratch: quant multiplier
  imul ebx, r9d                 ; ebx = coeff[0] * dcscalar
  movq mm2, mm7
  mov rdi, r8				; mpeg_quant_matrices
ALIGN 4
  DEQUANT4INTRAMMX 0
  mov esi, -2048 ;  nop
  cmp ebx, esi
  DEQUANT4INTRAMMX 1
  cmovl ebx, esi                ; clamp DC to >= -2048
  neg esi
  sub esi, byte 1 ;2047
  DEQUANT4INTRAMMX 2
  cmp ebx, esi
  cmovg ebx, esi                ; clamp DC to <= 2047
  DEQUANT4INTRAMMX 3
  mov [byte rdx], bx            ; data[0] = clamped DC (overwrites macro result)
  DEQUANT4INTRAMMX 4
  DEQUANT4INTRAMMX 5
  DEQUANT4INTRAMMX 6
  DEQUANT4INTRAMMX 7
  pop rbx
  xor rax, rax                  ; return 0
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
; Ported from 32bit 3dne cousin
;-----------------------------------------------------------------------------

    ; Note:  We use (2*c + sgn(c) - sgn(-c)) as multiplier
    ; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
    ; sgn(x) is the result of 'pcmpgtw 0,x':  0 if x>=0, -1 if x<0.
; It's mixed with the extraction of the absolute value.

ALIGN 16
;-----------------------------------------------------------------------------
; uint32_t dequant_mpeg_inter_x86_64(int16_t *data, const int16_t *coeff,
;                                    uint32_t quant,
;                                    const uint16_t *mpeg_matrices)
; SysV AMD64 entry: rdi=data, rsi=coeff, rdx=quant, rcx=mpeg_matrices;
; remapped below to rdx=data, rcx=coeff, rdi=matrices.
; rax is the loop counter: -14, -12, ..., 0 -- 8 iterations of 8 words
; each, i.e. the whole 64-coefficient block.
; mm7 holds the doubled quant multiplier; mm6 accumulates the XOR of every
; output qword for the MPEG "mismatch control" fixup of data[63] at the end.
; Returns 0 in rax.
;-----------------------------------------------------------------------------
dequant_mpeg_inter_x86_64:
  mov rax, rdx			; quant
  mov rdx, rdi			; data
  mov rdi, rcx			; mpeg_matrices
  mov rcx, rsi			; coeff
  lea r11, [mmx_mul_quant wrt rip]
  movq mm7, [r11  + rax*8 - 8]
  mov rax, -14
  paddw mm7, mm7    ; << 1
  pxor mm6, mm6     ; mismatch sum
  pxor mm1, mm1
  pxor mm3, mm3
ALIGN 16
.loop
  movq mm0, [rcx+8*rax + 7*16   ]   ; mm0 = coeff[i]
  pcmpgtw mm1, mm0  ; mm1 = sgn(c)    (preserved)
  movq mm2, [rcx+8*rax + 7*16 +8]   ; mm2 = coeff[i+1]
  pcmpgtw mm3, mm2  ; mm3 = sgn(c')   (preserved)
  paddsw mm0, mm1   ; c += sgn(c)
  paddsw mm2, mm3   ; c += sgn(c')
  paddw mm0, mm0    ; c *= 2
  paddw mm2, mm2    ; c'*= 2
  movq mm4, [mmzero wrt rip]
  movq mm5, [mmzero wrt rip]
  psubw mm4, mm0    ; -c
  psubw mm5, mm2    ; -c'
  psraw mm4, 16     ; mm4 = sgn(-c)  (count > 15 still yields the pure sign)
  psraw mm5, 16     ; mm5 = sgn(-c')
  psubsw mm0, mm4   ; c  -= sgn(-c)
  psubsw mm2, mm5   ; c' -= sgn(-c')
  pxor mm0, mm1     ; finish changing sign if needed
  pxor mm2, mm3     ; finish changing sign if needed
 ; we're short on register, here. Poor pairing...
  movq mm4, mm7     ; (matrix*quant)
;  nop
  pmullw mm4, [rdi + 512 + 8*rax + 7*16]
  movq mm5, mm4
  pmulhw mm5, mm0   ; high of c*(matrix*quant)
  pmullw mm0, mm4   ; low  of c*(matrix*quant)
  movq mm4, mm7     ; (matrix*quant)
  pmullw mm4, [rdi + 512 + 8*rax + 7*16 + 8]
  add rax, byte 2
  pcmpgtw mm5, [mmzero wrt rip] ; saturation mask: -1 where high word > 0
  paddusw mm0, mm5              ; saturate low word to 0xffff on overflow
  psrlw mm0, 5
  pxor mm0, mm1     ; start restoring sign
  psubusw mm1, mm5  ; cancel the sign fixup on saturated words
  movq mm5, mm4
  pmulhw mm5, mm2   ; high of c*(matrix*quant)
  pmullw mm2, mm4   ; low  of c*(matrix*quant)
  psubw mm0, mm1    ; finish restoring sign
  pcmpgtw mm5, [mmzero wrt rip]
  paddusw mm2, mm5
  psrlw mm2, 5
  pxor mm2, mm3     ; start restoring sign
  psubusw mm3, mm5
  psubw mm2, mm3    ; finish restoring sign
  movq mm1, [mmzero wrt rip]    ; re-zero sign regs for the next iteration
  movq mm3, [byte mmzero wrt rip]
  pxor mm6, mm0                             ; mismatch control
  movq [rdx + 8*rax + 7*16 -2*8   ], mm0    ; data[i]
  pxor mm6, mm2                             ; mismatch control
  movq [rdx + 8*rax + 7*16 -2*8 +8], mm2    ; data[i+1]
  jng .loop
;  nop

 ; mismatch control: fold the accumulated XOR (four words of mm6) down to a
 ; single parity bit; when that parity is even, flip the LSB of data[63].
  pshufw mm0, mm6, 01010101b
  pshufw mm1, mm6, 10101010b
  pshufw mm2, mm6, 11111111b
  pxor mm6, mm0
  pxor mm1, mm2
  pxor mm6, mm1     ; low word of mm6 = XOR of all four words
  movd rax, mm6
  and rax, byte 1   ; keep parity bit only
  xor rax, byte 1   ; ax = 1 when parity is even, else 0
  xor word [rdx + 2*63], ax     ; conditionally toggle data[63]'s LSB
  xor rax, rax      ; return 0
  ret
.endfunc

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -