/* k6opt.s */
/* ("Font size:" -- label left over from the code-viewer page, not part of the source) */
/* Read-only constant tables (4-byte aligned). */
.section .rodata
.align 4
/*
 * extremes: two dwords, each a pair of packed signed 16-bit words.
 *   extremes+0 = 0x8000,0x8000  (-32768 twice, the most negative word)
 *   extremes+4 = 0x7fff,0x7fff  (+32767 twice, the most positive word)
 * These are the natural starting values for a running signed maximum
 * and a running signed minimum respectively.
 */
.type extremes,@object
.size extremes,8
extremes:
.long 0x80008000
.long 0x7fff7fff
/*
 * ones: 8 bytes holding the 16-bit value 1 in each of four words.
 * NOTE(review): nothing in the code visible in this file references
 * this symbol; confirm whether another routine still needs it.
 */
.type ones,@object
.size ones,8
ones:
.long 0x00010001
.long 0x00010001
.text
.align 4

/*
 * long k6maxmin (const short *p, int n, short *out)
 *
 * i386 cdecl, MMX.  Scans the n signed 16-bit samples starting at p.
 *   - If out is non-NULL, the largest sample is stored in out[0] and
 *     the smallest in out[1].
 *   - Returns max(largest, -smallest) computed in 32 bits, so an input
 *     containing -32768 returns 32768.
 *   - With n == 0 nothing is read and the seed values are reported
 *     unchanged: out[0] = -32768, out[1] = 32767, return value -32767.
 *
 * Strategy: 4 words at a time with packed compares while at least 4
 * words remain, then one 2-word step, then one scalar word.
 *
 * Registers:
 *   %esi        read cursor (callee-saved, pushed/popped here)
 *   %edx        end threshold: top-8, then top-4, then top-2,
 *               where top = p + 2*n
 *   %mm0 / %mm1 packed running maxima / minima
 *   %mm2-%mm4   scratch
 *   %ecx / %eax scalar max / min at the end
 * The MMX state is cleared with emms on entry and before returning.
 *
 * The max/min seeds are built from immediates instead of being loaded
 * from absolute addresses in .rodata, so this function carries no data
 * relocation in .text and links cleanly into PIC/PIE images.
 */

/*
 * K6_PMAXW src,acc,mask,keep
 *   acc = per-word signed max(acc, src).  src is preserved;
 *   mask and keep are clobbered.
 */
.macro K6_PMAXW src,acc,mask,keep
        movq    \src,\mask
        pcmpgtw \acc,\mask              /* mask = words where src > acc */
        movq    \mask,\keep
        pand    \src,\mask              /* src words that win */
        pandn   \acc,\keep              /* acc words that stay */
        por     \mask,\keep
        movq    \keep,\acc
.endm

/*
 * K6_PMINW src,acc,mask
 *   acc = per-word signed min(acc, src).  src and mask are clobbered
 *   (src ends up equal to the new acc).
 */
.macro K6_PMINW src,acc,mask
        movq    \acc,\mask
        pcmpgtw \src,\mask              /* mask = words where src < acc */
        pand    \mask,\src              /* src words that win */
        pandn   \acc,\mask              /* acc words that stay */
        por     \mask,\src
        movq    \src,\acc
.endm

.globl k6maxmin
.type k6maxmin,@function
k6maxmin:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        emms
        movl    8(%ebp),%esi            /* esi = p */
        movl    12(%ebp),%eax           /* eax = n */
        leal    -8(%esi,%eax,2),%edx    /* edx = top-8 */
        cmpl    %edx,%esi
        jbe     .Lk6mm_quads            /* at least 4 words available */

        /* Fewer than 4 words: start from the extreme seeds.  n (in eax)
           is not needed any more, so eax is free as a staging register. */
        movl    $0x80008000,%eax        /* two words of -32768 */
        movd    %eax,%mm0
        movl    $0x7fff7fff,%eax        /* two words of +32767 */
        movd    %eax,%mm1
        jmp     .Lk6mm_pair

.Lk6mm_quads:
        movq    (%esi),%mm0             /* first 4 words seed the maxima ... */
        movq    %mm0,%mm1               /* ... and the minima */
        addl    $8,%esi
        cmpl    %edx,%esi
        ja      .Lk6mm_fold4
        .p2align 2
.Lk6mm_quad_loop:
        movq    (%esi),%mm2
        K6_PMAXW %mm2,%mm0,%mm3,%mm4
        K6_PMINW %mm2,%mm1,%mm3
        addl    $8,%esi
        cmpl    %edx,%esi
        jbe     .Lk6mm_quad_loop

.Lk6mm_fold4:
        /* Fold the 4-word max/min down into the lower 2 words; the
           upper 2 words are don't-care from here on. */
        movq    %mm0,%mm2
        psrlq   $32,%mm2
        K6_PMAXW %mm2,%mm0,%mm3,%mm4
        movq    %mm1,%mm2
        psrlq   $32,%mm2
        K6_PMINW %mm2,%mm1,%mm3

.Lk6mm_pair:
        addl    $4,%edx                 /* edx = top-4 */
        cmpl    %edx,%esi
        ja      .Lk6mm_fold2
        /* here at least 2 words of input remain */
        movd    (%esi),%mm2
        K6_PMAXW %mm2,%mm0,%mm3,%mm4
        K6_PMINW %mm2,%mm1,%mm3
        addl    $4,%esi

.Lk6mm_fold2:
        /* Fold the 2-word max/min down into the lowest word. */
        movq    %mm0,%mm2
        psrlq   $16,%mm2
        K6_PMAXW %mm2,%mm0,%mm3,%mm4
        movd    %mm0,%ecx               /* cx = max so far */
        movq    %mm1,%mm2
        psrlq   $16,%mm2
        K6_PMINW %mm2,%mm1,%mm3
        movd    %mm1,%eax               /* ax = min so far */

        addl    $2,%edx                 /* edx = top-2 */
        cmpl    %edx,%esi
        ja      .Lk6mm_done
        /* here exactly one word of input is left */
        cmpw    (%esi),%cx
        jge     .Lk6mm_last_min
        movw    (%esi),%cx
.Lk6mm_last_min:
        cmpw    (%esi),%ax
        jle     .Lk6mm_done
        movw    (%esi),%ax

.Lk6mm_done:
        /* cx is the max, ax the min; widen both to 32 bits */
        movswl  %cx,%ecx
        movswl  %ax,%eax
        movl    16(%ebp),%edx           /* out, may be NULL */
        testl   %edx,%edx
        jz      .Lk6mm_absmax
        movw    %cx,(%edx)              /* out[0] = max */
        movw    %ax,2(%edx)             /* out[1] = min */

.Lk6mm_absmax:
        /* return the larger of max and -min */
        negl    %eax
        cmpl    %ecx,%eax
        jge     .Lk6mm_ret
        movl    %ecx,%eax
.Lk6mm_ret:
        emms
        popl    %esi
        leave
        ret
.Lfe6:
.size k6maxmin,.Lfe6-k6maxmin
/*
 * void Short_term_analysis_filteringx (short *u0, const short *rp0, int kn, short *s)
 *
 * i386 cdecl, MMX.  Runs an 8-stage lattice filter in place over the
 * kn samples at s, using the 8 coefficients at rp0 and the 8 state
 * words at u0 (both are read as exactly 8 shorts).  Per sample:
 *
 *     di = sav = s[k];
 *     for (i = 0; i < 8; i++) {
 *         ui   = u0[i];
 *         u0[i] = sav;
 *         sav  = sat16(ui + r(rp0[i] * di));
 *         di   = sat16(di + r(rp0[i] * ui));
 *     }
 *     s[k] = di;
 *
 * where r(x) = (x + 0x4000) >> 15 saturated to 16 bits, and sat16 is a
 * signed saturating 16-bit add.  Nothing is touched when kn <= 0.
 *
 * Frame layout (offsets from %ebp):
 *   lv_rp .. lv_rp+31   rp0 with every word written twice, so a single
 *                       movd fetches (RP,RP) for the paired multiply
 *   lv_s_top            s + 2*kn
 *   lv_u_top            u0 + 16
 * %edi and %esi are callee-saved and are pushed/popped here.
 */
.equiv pm_u0,8
.equiv pm_rp0,12
.equiv pm_kn,16
.equiv pm_s,20
.equiv lv_u_top,-4
.equiv lv_s_top,-8
.equiv lv_rp,-40
.align 4
.globl Short_term_analysis_filteringx
.type Short_term_analysis_filteringx,@function
Short_term_analysis_filteringx:
        pushl   %ebp
        movl    %esp,%ebp
        subl    $40,%esp
        pushl   %edi
        pushl   %esi

        /* build the doubled coefficient table on the stack */
        movl    pm_rp0(%ebp),%esi
        leal    lv_rp(%ebp),%edi
        cld
        .rept   8
        lodsw
        stosw
        stosw
        .endr

        emms
        movl    $0x4000,%eax
        movd    %eax,%mm4
        punpckldq %mm4,%mm4             /* mm4 = rounding term in both dwords */

        movl    pm_u0(%ebp),%eax
        addl    $16,%eax
        movl    %eax,lv_u_top(%ebp)     /* one past the 8th state word */

        movl    pm_s(%ebp),%edx         /* edx walks s for the whole routine */
        movl    pm_kn(%ebp),%eax
        leal    (%edx,%eax,2),%eax
        movl    %eax,lv_s_top(%ebp)
        cmpl    %eax,%edx
        jae     .Lstaf_exit             /* no samples to process */

        .p2align 2
.Lstaf_sample:
        leal    lv_rp(%ebp),%esi        /* esi = current (RP,RP) pair */
        movl    pm_u0(%ebp),%edi        /* edi = current state word */
        movw    (%edx),%ax
        roll    $16,%eax
        movw    (%edx),%ax              /* eax = (DI,DI): high = di, low = sav */

        .p2align 2
.Lstaf_stage:
        movl    %eax,%ecx               /* keep sav (low word) for the store */
        movw    (%edi),%ax              /* eax = (DI,U) */
        movd    (%esi),%mm3             /* mm3 = (0,0,RP,RP) */
        movw    %cx,(%edi)              /* u[i] = sav */
        movd    %eax,%mm2               /* mm2 = (0,0,DI,U) */
        rorl    $16,%eax
        movd    %eax,%mm1               /* mm1 = (0,0,U,DI) */
        movq    %mm1,%mm0
        pmullw  %mm3,%mm0               /* low halves of the products */
        pmulhw  %mm3,%mm1               /* high halves of the products */
        punpcklwd %mm1,%mm0             /* mm0 = (RP*U, RP*DI) as dwords */
        paddd   %mm4,%mm0               /* round */
        psrad   $15,%mm0
        packssdw %mm0,%mm0              /* back to saturated words */
        paddsw  %mm2,%mm0               /* mm0 = (?,?,DI',U') */
        movd    %mm0,%eax               /* eax = (DI',U') */
        addl    $2,%edi
        addl    $4,%esi
        cmpl    lv_u_top(%ebp),%edi
        jb      .Lstaf_stage

        rorl    $16,%eax
        movw    %ax,(%edx)              /* final di replaces the sample */
        addl    $2,%edx
        cmpl    lv_s_top(%ebp),%edx
        jb      .Lstaf_sample

        .p2align 2
.Lstaf_exit:
        emms
        popl    %esi
        popl    %edi
        leave
        ret
.size Short_term_analysis_filteringx,.-Short_term_analysis_filteringx
/*
 * Assembly stops at the .end directive below: per the GNU as manual,
 * nothing after .end is processed.  The STEP macro and the second
 * Short_term_analysis_filteringx that follow are therefore never
 * assembled; they read as an unrolled variant kept for reference.
 */
.end
/* 'as' macros seem to be case-insensitive: the macro is defined as
   STEP here and invoked as "step" further down */
/*
 * STEP n -- one filter stage with the 8 state words held in registers.
 *
 * Expects on entry:
 *   %esi        base of the doubled coefficient table; n is the byte
 *               offset of this stage's (RP,RP) pair
 *   %mm4:%mm5   the 8 state words as one 128-bit queue, next word to
 *               consume in the low word of %mm4
 *   %mm0        low word = value to push into the queue
 *   %eax        high word = DI
 *   %mm6        0x00004000 in both dwords (rounding term)
 * Leaves (DI',U') in %eax and in the low dword of %mm0.
 * Clobbers %ecx and %mm1-%mm3.
 */
.macro STEP n
/* n == 0 takes the .else branch, so no displacement is emitted */
.if \n
movd \n(%esi),%mm3 /* mm3 is (0,0,RP,RP) */
.else
movd (%esi),%mm3 /* mm3 is (0,0,RP,RP) */
.endif
/* copy of the upper queue half; its low word moves into mm4 below */
movq %mm5,%mm1;
/* take the next state word U from the low word of mm4 */
movd %mm4,%ecx; movw %cx,%ax /* (DI,U) */
/* shift the queue down one word: mm4's top word comes from mm5's low word */
psllq $48,%mm1; psrlq $16,%mm4; por %mm1,%mm4
/* ... and mm5's top word comes from the low word of mm0 */
psllq $48,%mm0; psrlq $16,%mm5; por %mm0,%mm5
movd %eax,%mm2 /* mm2 is (0,0,DI,U) */
rorl $16,%eax
movd %eax,%mm1 /* mm1 is (0,0,U,DI) */
movq %mm1,%mm0
/* 16x16 -> 32-bit products assembled from their low and high halves */
pmullw %mm3,%mm0
pmulhw %mm3,%mm1
punpcklwd %mm1,%mm0 /* mm0 is (RP*U,RP*DI) */
paddd %mm6,%mm0 /* mm6 is 0x00004000,0x00004000 */
psrad $15,%mm0 /* (RP*U,RP*DI) adjusted */
packssdw %mm0,%mm0 /* (*,*,RP*U,RP*DI) adjusted and saturated to word */
paddsw %mm2,%mm0 /* mm0 is (?,?, DI', U') */
movd %mm0,%eax /* (DI,U') */
.endm
/*
 * void Short_term_analysis_filteringx (short *u0, const short *rp0, int kn, short *s)
 *
 * Variant that keeps the 8 state words in %mm4/%mm5 for the whole call
 * and runs the stages through the STEP macro, four per loop pass.
 *
 * NOTE(review): this definition comes after the .end directive, so the
 * assembler never sees it.  It reuses the symbol name, the .equiv names
 * (pm_*, lv_s_top, lv_rp) and the labels .L179/.L181/.L185/.Lfe7 of the
 * routine earlier in the file, so it could not be assembled alongside
 * that routine without renaming.
 */
.equiv pm_u0,8
.equiv pm_rp0,12
.equiv pm_kn,16
.equiv pm_s,20
.equiv lv_rp_top,-4
.equiv lv_s_top,-8
.equiv lv_rp,-40 /* local version of rp0 with each word twice */
.align 4
.globl Short_term_analysis_filteringx
.type Short_term_analysis_filteringx,@function
Short_term_analysis_filteringx:
pushl %ebp
movl %esp,%ebp
/* 56 bytes are reserved; the named locals only reach down to -40 */
subl $56,%esp
pushl %edi
pushl %esi
pushl %ebx
/* copy the 8 coefficients to lv_rp, writing each word twice */
movl pm_rp0(%ebp),%esi;
leal lv_rp(%ebp),%edi;
cld
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
lodsw; stosw; stosw
/* edi now points just past the doubled table; remember it as the loop bound */
movl %edi,lv_rp_top(%ebp)
emms
movl $0x4000,%eax;
movd %eax,%mm6;
punpckldq %mm6,%mm6 /* (0x00004000,0x00004000) for rounding dword product pairs */
/* ebx keeps u0 for the write-back at .L179 */
movl pm_u0(%ebp),%ebx
movq (%ebx),%mm4; movq 8(%ebx),%mm5 /* the 8 u's */
movl pm_s(%ebp),%edx /* edx is local s ptr throughout below */
movl pm_kn(%ebp),%eax
leal (%edx,%eax,2),%eax
movl %eax,lv_s_top(%ebp)
/* no samples: skip to the exit, which still stores the u's back unchanged */
cmpl %eax,%edx
jae .L179
.p2align 2
.L181:
leal lv_rp(%ebp),%esi /* RP */
movw (%edx),%ax /* (0,DI) */
roll $16,%eax
movw (%edx),%ax /* (DI,DI) */
/* low word of mm0 is what the first STEP pushes into the u queue */
movd %eax,%mm0
.p2align 2
.L185: /* RP is %esi */
/* four stages per pass; the table spans 32 bytes, so this loop runs twice */
step 0
step 4
step 8
step 12
/*
step 16
step 20
step 24
step 28
*/
addl $16,%esi
cmpl lv_rp_top(%ebp),%esi
jb .L185
rorl $16,%eax
movw %ax,(%edx) /* last DI goes to *s */
addl $2,%edx /* next s */
cmpl lv_s_top(%ebp),%edx
jb .L181
.L179:
/* write the register-held state back to u0[0..7] */
movq %mm4,(%ebx); movq %mm5,8(%ebx) /* the 8 u's */
emms
/* restore callee-saved registers in reverse push order */
popl %ebx
popl %esi
popl %edi
leave
ret
.Lfe7:
.size Short_term_analysis_filteringx,.Lfe7-Short_term_analysis_filteringx
.ident "GCC: (GNU) 2.95.2 19991109 (Debian GNU/Linux)"
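/*
 * Keyboard shortcuts (help text from the code-viewer page, not part of
 * the assembly source):
 *   Copy code             Ctrl + C
 *   Search code           Ctrl + F
 *   Full-screen mode      F11
 *   Switch theme          Ctrl + Shift + D
 *   Show shortcuts        ?
 *   Increase font size    Ctrl + =
 *   Decrease font size    Ctrl + -
 */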