📄 paq7asm-x86_64.asm

📁 汇编语言写的压缩算法
💻 ASM
字号:
; YASM x86-64 assembly language code for PAQ7/8 ver. 2, Jan 18, 2007
;
; (C) 2005-2007, Matt Mahoney, Matthew Fite.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; This code was tested on an Athlon-64 under Ubuntu Linux 2.6.15.27.amd64-generic
; with paq8f and paq8jd.  It should work with any PAQ version since paq7,
; because all versions use the same paq7asm.asm code for 32 bit Windows/Linux
; versions.  To compile e.g. paq8jd in Linux:
;
;   yasm paq7asm-x86_64.asm -f elf -m amd64
;   g++ -O3 -s -fomit-frame-pointer -DUNIX paq8jd.cpp paq7asm-x86_64.o -o paq8jd
;
; This code has not been tested in Windows.  (You would need XP Professional
; 64 bit edition and a 64 bit compiler).
;
; Syntax: Intel (YASM/NASM).
; ABI:    System V AMD64 -- arguments arrive in rdi, rsi, rdx, rcx.
;         NOTE(review): the Microsoft x64 convention passes arguments in
;         rcx, rdx, r8, r9 instead, so a Windows build would presumably
;         need a different register mapping -- confirm before porting.

section .text
BITS 64

;-----------------------------------------------------------------------
; int dot_product(short* a, short* b, int n)
;
; Vector product a*b of n signed words, returning a signed dword.
; n is rounded up to a multiple of 8.  Each pair sum
; a[2k]*b[2k] + a[2k+1]*b[2k+1] is shifted right arithmetically by 8
; bits before it is accumulated, so the result is scaled down by 8 bits.
;
; In:    rdi = a, rsi = b, edx = n
; Out:   eax = scaled sum; 0 when n <= 0
; Clobb: rcx, rdi, rsi, xmm0, xmm1, flags
; Needs: a and b 16-byte aligned (movdqa and the pmaddwd memory operand
;        fault otherwise) and readable up to n rounded up to a multiple
;        of 8 words.
;-----------------------------------------------------------------------
    global dot_product
    align 16
dot_product:
    xor     eax, eax                ; result is 0 if there is nothing to sum
    movsxd  rcx, edx                ; n is a 32-bit int: upper half of rdx is undefined
    add     rcx, 7                  ; round n up ...
    and     rcx, -8                 ; ... to a multiple of 8
    jle     .done                   ; n <= 0 (flags come from the AND)
    sub     rdi, 16                 ; bias bases so [base+rcx*2] is the
    sub     rsi, 16                 ;   8-word group that ends at index rcx
    pxor    xmm0, xmm0              ; four partial sums = 0
.loop:                              ; each pass folds 8 products into 4 partial sums
    movdqa  xmm1, [rdi+rcx*2]       ; 8 words of a
    pmaddwd xmm1, [rsi+rcx*2]       ; 4 dwords: a[2k]*b[2k] + a[2k+1]*b[2k+1]
    psrad   xmm1, 8                 ; scale each pair sum down by 8 bits
    paddd   xmm0, xmm1
    sub     rcx, 8                  ; rcx = words remaining, a multiple of 8
    ja      .loop                   ; continue while rcx > 0
    movdqa  xmm1, xmm0              ; horizontal add of the 4 partial sums
    psrldq  xmm1, 8
    paddd   xmm0, xmm1
    movdqa  xmm1, xmm0
    psrldq  xmm1, 4
    paddd   xmm0, xmm1
    movd    eax, xmm0               ; return the low dword
.done:
    ret

;-----------------------------------------------------------------------
; void train(short* t, short* w, int n, int err)
;
; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += (t[i]*err*2 >> 16)+1 >> 1 bounded to +- 32K.
; The doubling of t[i] and both additions use signed saturation.
; n is rounded up to a multiple of 8.
;
; In:    rdi = t, rsi = w, edx = n, ecx = err (only the low 16 bits,
;        treated as a signed word, are used)
; Out:   none; w[] is updated in place.  Nothing is touched when n <= 0.
; Clobb: rcx, rdi, rsi, xmm0-xmm3, flags
; Needs: t and w 16-byte aligned (movdqa faults otherwise) and valid up
;        to n rounded up to a multiple of 8 words.
;-----------------------------------------------------------------------
    global train
    align 16
train:
    movd    xmm0, ecx               ; low word of xmm0 = err
    pshuflw xmm0, xmm0, 0           ; copy word 0 into words 0..3
    pshufd  xmm0, xmm0, 0           ; copy dword 0 into dwords 0..3: 8 copies of err
    movsxd  rcx, edx                ; n is a 32-bit int: upper half of rdx is undefined
    add     rcx, 7                  ; round n up ...
    and     rcx, -8                 ; ... to a multiple of 8
    jle     .done                   ; n <= 0: test the count itself, before any
                                    ;   other instruction overwrites the flags
    pcmpeqb xmm1, xmm1              ; all ones ...
    psrlw   xmm1, 15                ; ... -> 8 copies of 1 (rounding term)
    sub     rdi, 16                 ; bias bases so [base+rcx*2] is the
    sub     rsi, 16                 ;   8-word group that ends at index rcx
    align 16
.loop:                              ; each iteration adjusts 8 weights
    movdqa  xmm2, [rsi+rcx*2]       ; w[i]
    movdqa  xmm3, [rdi+rcx*2]       ; t[i]
    paddsw  xmm3, xmm3              ; t[i]*2 (saturating)
    pmulhw  xmm3, xmm0              ; t[i]*err*2 >> 16
    paddsw  xmm3, xmm1              ; (t[i]*err*2 >> 16)+1
    psraw   xmm3, 1                 ; (t[i]*err*2 >> 16)+1 >> 1
    paddsw  xmm2, xmm3              ; w[i] + delta, saturated to 16 bits
    movdqa  [rsi+rcx*2], xmm2
    sub     rcx, 8                  ; rcx = words remaining, a multiple of 8
    ja      .loop                   ; continue while rcx > 0
.done:
    ret
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -