📄 k6opt.s
字号:
/*
 * k6opt.s -- MMX helper routines for 32-bit x86, GNU as (AT&T syntax).
 *
 * Conventions visible in this file:
 *   - every routine builds an %ebp frame and reads its arguments from the
 *     stack: 8(%ebp) is the first argument, 12(%ebp) the second, and so on;
 *   - results, where there are any, are returned in %eax;
 *   - %ebx, %esi and %edi are pushed/popped by every routine that uses them;
 *   - every path that touches an MMX register executes emms before returning.
 */
	.file	"k6opt.s"
	.version	"01.01"
/* gcc2_compiled.: */

.section .rodata
	.align 4
	.type	coefs,@object
	.size	coefs,24
/* 11 filter taps followed by one zero word of padding, so that the table is
   exactly three quadwords (12 words) long. */
coefs:
	.value -134
	.value -374
	.value 0
	.value 2054
	.value 5741
	.value 8192
	.value 5741
	.value 2054
	.value 0
	.value -374
	.value -134
	.value 0

.text
	.align 4
/*
 * void Weighting_filter (const short *e, short *x)
 *
 * For k = 0..39:
 *     x[k] = saturate16((sum(i = 0..11) coefs[i] * e[k-5+i] + 0x1000) >> 13)
 *
 * The input is read from e[-5] onwards.  The twelfth word of each window is
 * multiplied by the zero padding coefficient but is still loaded, so for
 * k = 39 the last load touches e[45].
 *
 * Registers: %ebx = e - 5 words, %edi = x, %esi = k,
 *            %mm1..%mm3 = coefs, %mm5 = rounding constant.
 */
.globl Weighting_filter
	.type	Weighting_filter,@function
Weighting_filter:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	12(%ebp),%edi		/* edi = x */
	movl	8(%ebp),%ebx		/* ebx = e */
	addl	$-10,%ebx		/* back up 5 words: ebx = &e[-5] */
	emms
	movl	$0x1000,%eax
	movd	%eax,%mm5		/* for rounding (low dword only) */
	movq	coefs,%mm1
	movq	coefs+8,%mm2
	movq	coefs+16,%mm3
	xorl	%esi,%esi		/* k = 0 */
	.p2align 2
.L21:
	movq	(%ebx,%esi,2),%mm0
	pmaddwd	%mm1,%mm0		/* taps 0..3 */
	movq	8(%ebx,%esi,2),%mm4
	pmaddwd	%mm2,%mm4		/* taps 4..7 */
	paddd	%mm4,%mm0
	movq	16(%ebx,%esi,2),%mm4
	pmaddwd	%mm3,%mm4		/* taps 8..11 */
	paddd	%mm4,%mm0
	movq	%mm0,%mm4
	punpckhdq %mm0,%mm4		/* mm4 has high int32 of mm0 dup'd */
	paddd	%mm4,%mm0		/* low dword = full sum */
	paddd	%mm5,%mm0		/* add for roundoff */
	psrad	$13,%mm0
	packssdw %mm0,%mm0		/* saturate to 16 bits */
	movd	%mm0,%eax		/* ax has result */
	movw	%ax,(%edi,%esi,2)	/* x[k] = result */
	incl	%esi
	cmpl	$39,%esi
	jle	.L21
	emms
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe1:
	.size	Weighting_filter,.Lfe1-Weighting_filter

/*
 * ccstep n: accumulate into %mm0 the pairwise products of the four words at
 * n(%edi) and the four words at n(%esi).  Clobbers %mm1 and %mm2.
 * The n == 0 case is spelled without a displacement.
 */
.macro ccstep n
.if \n
	movq	\n(%edi),%mm1
	movq	\n(%esi),%mm2
.else
	movq	(%edi),%mm1
	movq	(%esi),%mm2
.endif
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
.endm

	.align 4
/*
 * long k6maxcc(const short *wt, const short *dp, short *Nc_out)
 *
 * For each lag = 40..120 computes the 40-word inner product
 *     sum(i = 0..39) wt[i] * dp[i - lag]
 * and keeps the largest one.  The running maximum starts at 0 and the
 * remembered lag starts at 40; a lag replaces the current best only when its
 * product is strictly greater (signed compare).
 *
 * Returns the maximum in %eax and stores the winning lag in *Nc_out.
 *
 * Registers: %edi = wt, %esi = dp - lag, %ebx = lag,
 *            %edx = best product so far, %ecx = lag of best product.
 */
.globl k6maxcc
	.type	k6maxcc,@function
k6maxcc:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	emms
	movl	8(%ebp),%edi		/* edi = wt */
	movl	12(%ebp),%esi		/* esi = dp */
	movl	$0,%edx			/* will be maximum inner-product */
	movl	$40,%ebx
	movl	%ebx,%ecx		/* will be index of max inner-product */
	subl	$80,%esi		/* esi = dp - 40 words */
	.p2align 2
.L41:
	movq	(%edi),%mm0
	movq	(%esi),%mm2
	pmaddwd	%mm2,%mm0		/* first 4 words start the accumulator */
	ccstep 8
	ccstep 16
	ccstep 24
	ccstep 32
	ccstep 40
	ccstep 48
	ccstep 56
	ccstep 64
	ccstep 72
	movq	%mm0,%mm1
	punpckhdq %mm0,%mm1		/* mm1 has high int32 of mm0 dup'd */
	paddd	%mm1,%mm0
	movd	%mm0,%eax		/* eax has result */
	cmpl	%edx,%eax
	jle	.L40			/* not a new maximum */
	movl	%eax,%edx
	movl	%ebx,%ecx
	.p2align 2
.L40:
	subl	$2,%esi			/* next lag: one word further back in dp */
	incl	%ebx
	cmpl	$120,%ebx
	jle	.L41
	movl	16(%ebp),%eax
	movw	%cx,(%eax)		/* *Nc_out = best lag */
	movl	%edx,%eax		/* return best product */
	emms
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe2:
	.size	k6maxcc,.Lfe2-k6maxcc

	.align 4
/*
 * long k6iprod (const short *p, const short *q, int n)
 *
 * Returns sum(i = 0..n-1) p[i] * q[i], accumulated in two 32-bit lanes that
 * are added together at the end.
 *
 * The vectors are consumed 16 words at a time, then 4, then 2, then 1.
 * "top" in the comments below is p + n words; %edx holds top minus the size
 * of the chunk currently being considered.
 *
 * Registers: %esi = p cursor, %edi = q cursor, %mm0 = accumulator.
 */
.globl k6iprod
	.type	k6iprod,@function
k6iprod:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	emms
	pxor	%mm0,%mm0		/* accumulator = 0 */
	movl	8(%ebp),%esi		/* esi = p */
	movl	12(%ebp),%edi		/* edi = q */
	movl	16(%ebp),%eax		/* eax = n */
	leal	-32(%esi,%eax,2),%edx	/* edx = top - 32 */
	cmpl	%edx,%esi
	ja	.L202
	.p2align 2
.L201:					/* 16 words per iteration */
	ccstep 0
	ccstep 8
	ccstep 16
	ccstep 24
	addl	$32,%esi
	addl	$32,%edi
	cmpl	%edx,%esi
	jbe	.L201
	.p2align 2
.L202:
	addl	$24,%edx		/* now edx = top-8 */
	cmpl	%edx,%esi
	ja	.L205
	.p2align 2
.L203:					/* 4 words per iteration */
	ccstep 0
	addl	$8,%esi
	addl	$8,%edi
	cmpl	%edx,%esi
	jbe	.L203
	.p2align 2
.L205:
	addl	$4,%edx			/* now edx = top-4 */
	cmpl	%edx,%esi
	ja	.L207
	movd	(%edi),%mm1		/* 2 words; movd zeroes the upper half */
	movd	(%esi),%mm2
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
	addl	$4,%esi
	addl	$4,%edi
	.p2align 2
.L207:
	addl	$2,%edx			/* now edx = top-2 */
	cmpl	%edx,%esi
	ja	.L209
	/* One word left.  It must be zero-extended, not sign-extended: pmaddwd
	   multiplies word 1 of both operands as well as word 0, so a
	   sign-extended pair of negative samples would contribute an extra
	   (-1)*(-1) = +1 to the sum.  With word 1 cleared, word 0 is still
	   treated as a signed 16-bit value by pmaddwd. */
	movzwl	(%edi),%eax
	movd	%eax,%mm1
	movzwl	(%esi),%eax
	movd	%eax,%mm2
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
	.p2align 2
.L209:
	movq	%mm0,%mm1
	punpckhdq %mm0,%mm1		/* mm1 has high int32 of mm0 dup'd */
	paddd	%mm1,%mm0
	movd	%mm0,%eax		/* eax has result */
	emms
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe3:
	.size	k6iprod,.Lfe3-k6iprod

	.align 4
/*
 * void k6vsraw (short *p, int n, int bits)
 *
 * In-place rounding arithmetic right shift of n words.  Does nothing when
 * bits <= 0.  Otherwise, for every word:
 *     p[i] = saturating_add16(p[i], r) >> bits      (arithmetic shift)
 * where r is built per 16-bit lane as (1 << bits) >> 1.
 *
 * Processes 8 words per iteration, then 2 at a time, then a final single
 * word.  "top" below is p + n words.
 *
 * Registers: %esi = cursor, %mm3 = shift count, %mm2 = rounding constant r.
 */
.globl k6vsraw
	.type	k6vsraw,@function
k6vsraw:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	movl	8(%ebp),%esi		/* esi = p */
	movl	16(%ebp),%ecx		/* ecx = bits */
	andl	%ecx,%ecx
	jle	.L399			/* bits <= 0: nothing to do, no MMX used */
	movl	12(%ebp),%eax		/* eax = n */
	leal	-16(%esi,%eax,2),%edx	/* edx = top - 16 */
	emms
	movd	%ecx,%mm3
	movq	ones,%mm2
	psllw	%mm3,%mm2
	psrlw	$1,%mm2			/* mm2 = rounding constant in each word */
	cmpl	%edx,%esi
	ja	.L306
	.p2align 2
.L302:					/* 8 words per iteration */
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	paddsw	%mm2,%mm1
	psraw	%mm3,%mm1
	movq	%mm0,(%esi)
	movq	%mm1,8(%esi)
	addl	$16,%esi
	cmpl	%edx,%esi
	jbe	.L302
	.p2align 2
.L306:
	addl	$12,%edx		/* now edx = top-4 */
	cmpl	%edx,%esi
	ja	.L310
	.p2align 2
.L308:					/* do up to 6 words, two at a time */
	movd	(%esi),%mm0
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	movd	%mm0,(%esi)
	addl	$4,%esi
	cmpl	%edx,%esi
	jbe	.L308
	.p2align 2
.L310:
	addl	$2,%edx			/* now edx = top-2 */
	cmpl	%edx,%esi
	ja	.L315
	movzwl	(%esi),%eax		/* final single word */
	movd	%eax,%mm0
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	movd	%mm0,%eax
	movw	%ax,(%esi)
	.p2align 2
.L315:
	emms
.L399:
	popl	%esi
	leave
	ret
.Lfe4:
	.size	k6vsraw,.Lfe4-k6vsraw

	.align 4
/*
 * void k6vsllw (short *p, int n, int bits)
 *
 * In-place left shift of n words by bits (p[i] <<= bits, per 16-bit lane).
 * Does nothing when bits <= 0.
 *
 * Same chunking as k6vsraw: 8 words per iteration, then 2 at a time, then a
 * final single word.  "top" below is p + n words.
 *
 * Registers: %esi = cursor, %mm3 = shift count.
 */
.globl k6vsllw
	.type	k6vsllw,@function
k6vsllw:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	movl	8(%ebp),%esi		/* esi = p */
	movl	16(%ebp),%ecx		/* ecx = bits */
	andl	%ecx,%ecx
	jle	.L499			/* bits <= 0: nothing to do, no MMX used */
	movl	12(%ebp),%eax		/* eax = n */
	leal	-16(%esi,%eax,2),%edx	/* edx = top - 16 */
	emms
	movd	%ecx,%mm3
	cmpl	%edx,%esi
	ja	.L406
	.p2align 2
.L402:					/* 8 words per iteration */
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	psllw	%mm3,%mm0
	psllw	%mm3,%mm1
	movq	%mm0,(%esi)
	movq	%mm1,8(%esi)
	addl	$16,%esi
	cmpl	%edx,%esi
	jbe	.L402
	.p2align 2
.L406:
	addl	$12,%edx		/* now edx = top-4 */
	cmpl	%edx,%esi
	ja	.L410
	.p2align 2
.L408:					/* do up to 6 words, two at a time */
	movd	(%esi),%mm0
	psllw	%mm3,%mm0
	movd	%mm0,(%esi)
	addl	$4,%esi
	cmpl	%edx,%esi
	jbe	.L408
	.p2align 2
.L410:
	addl	$2,%edx			/* now edx = top-2 */
	cmpl	%edx,%esi
	ja	.L415
	movzwl	(%esi),%eax		/* final single word */
	movd	%eax,%mm0
	psllw	%mm3,%mm0
	movd	%mm0,%eax
	movw	%ax,(%esi)
	.p2align 2
.L415:
	emms
.L499:
	popl	%esi
	leave
	ret
.Lfe5:
	.size	k6vsllw,.Lfe5-k6vsllw

.section .rodata
	.align 4
	.type	extremes,@object
	.size	extremes,8
/* First dword: two words of -32768 (starting value for a running maximum).
   Second dword: two words of +32767 (starting value for a running minimum). */
extremes:
	.long 0x80008000
	.long 0x7fff7fff
	.type	ones,@object
	.size	ones,8
/* Four words of 1; k6vsraw shifts this to build its rounding constant. */
ones:
	.long 0x00010001
	.long 0x00010001

.text
	.align 4
/*
 * long k6maxmin (const short *p, int n, short *out)
 *
 * Finds the signed maximum and minimum of the n words at p.
 * If out is non-null, stores out[0] = max and out[1] = min.
 * Returns the larger of max and -min, computed in 32 bits.
 *
 * When n >= 4 the running max/min vectors are seeded from the first four
 * words; otherwise they are seeded from the extremes table.  The input is
 * consumed 4 words at a time, then one 2-word step, then one single word.
 * "top" below is p + n words.
 *
 * Registers: %esi = cursor, %mm0 = per-lane maxima, %mm1 = per-lane minima,
 *            %cx = max, %ax = min once the lanes have been merged.
 */
.globl k6maxmin
	.type	k6maxmin,@function
k6maxmin:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	emms
	movl	8(%ebp),%esi		/* esi = p */
	movl	12(%ebp),%eax		/* eax = n */
	leal	-8(%esi,%eax,2),%edx	/* edx = top - 8 */
	cmpl	%edx,%esi
	jbe	.L52			/* at least 4 words available */
	movd	extremes,%mm0		/* fewer than 4 words: seed max with -32768 */
	movd	extremes+4,%mm1		/* ... and min with +32767 */
	jmp	.L58
	.p2align 2
.L52:
	movq	(%esi),%mm0		/* mm0 will be max's */
	movq	%mm0,%mm1		/* mm1 will be min's */
	addl	$8,%esi
	cmpl	%edx,%esi
	ja	.L56
	.p2align 2
.L54:					/* 4 words per iteration */
	movq	(%esi),%mm2
	movq	%mm2,%mm3
	pcmpgtw	%mm0,%mm3		/* mm3 is bitmask for words where mm2 > mm0 */
	movq	%mm3,%mm4
	pand	%mm2,%mm3		/* mm3 is mm2 masked to new max's */
	pandn	%mm0,%mm4		/* mm4 is mm0 masked to its max's */
	por	%mm3,%mm4
	movq	%mm4,%mm0		/* now mm0 is updated max's */
	movq	%mm1,%mm3
	pcmpgtw	%mm2,%mm3		/* mm3 is bitmask for words where mm2 < mm1 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new min's */
	pandn	%mm1,%mm3		/* mm3 is mm1 masked to its min's */
	por	%mm3,%mm2
	movq	%mm2,%mm1		/* now mm1 is updated min's */
	addl	$8,%esi
	cmpl	%edx,%esi
	jbe	.L54
	.p2align 2
.L56:	/* merge down the 4-word max/mins to lower 2 words */
	movq	%mm0,%mm2
	psrlq	$32,%mm2
	movq	%mm2,%mm3
	pcmpgtw	%mm0,%mm3		/* mm3 is bitmask for words where mm2 > mm0 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new max's */
	pandn	%mm0,%mm3		/* mm3 is mm0 masked to its max's */
	por	%mm3,%mm2
	movq	%mm2,%mm0		/* now mm0 is updated max's */
	movq	%mm1,%mm2
	psrlq	$32,%mm2
	movq	%mm1,%mm3
	pcmpgtw	%mm2,%mm3		/* mm3 is bitmask for words where mm2 < mm1 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new min's */
	pandn	%mm1,%mm3		/* mm3 is mm1 masked to its min's */
	por	%mm3,%mm2
	movq	%mm2,%mm1		/* now mm1 is updated min's */
	.p2align 2
.L58:
	addl	$4,%edx			/* now edx = top-4 */
	cmpl	%edx,%esi
	ja	.L62
	/* here, there are >= 2 words of input remaining */
	movd	(%esi),%mm2
	movq	%mm2,%mm3
	pcmpgtw	%mm0,%mm3		/* mm3 is bitmask for words where mm2 > mm0 */
	movq	%mm3,%mm4
	pand	%mm2,%mm3		/* mm3 is mm2 masked to new max's */
	pandn	%mm0,%mm4		/* mm4 is mm0 masked to its max's */
	por	%mm3,%mm4
	movq	%mm4,%mm0		/* now mm0 is updated max's */
	movq	%mm1,%mm3
	pcmpgtw	%mm2,%mm3		/* mm3 is bitmask for words where mm2 < mm1 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new min's */
	pandn	%mm1,%mm3		/* mm3 is mm1 masked to its min's */
	por	%mm3,%mm2
	movq	%mm2,%mm1		/* now mm1 is updated min's */
	addl	$4,%esi
	.p2align 2
.L62:	/* merge down the 2-word max/mins to 1 word */
	movq	%mm0,%mm2
	psrlq	$16,%mm2
	movq	%mm2,%mm3
	pcmpgtw	%mm0,%mm3		/* mm3 is bitmask for words where mm2 > mm0 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new max's */
	pandn	%mm0,%mm3		/* mm3 is mm0 masked to its max's */
	por	%mm3,%mm2
	movd	%mm2,%ecx		/* cx is max so far */
	movq	%mm1,%mm2
	psrlq	$16,%mm2
	movq	%mm1,%mm3
	pcmpgtw	%mm2,%mm3		/* mm3 is bitmask for words where mm2 < mm1 */
	pand	%mm3,%mm2		/* mm2 is mm2 masked to new min's */
	pandn	%mm1,%mm3		/* mm3 is mm1 masked to its min's */
	por	%mm3,%mm2
	movd	%mm2,%eax		/* ax is min so far */
	addl	$2,%edx			/* now edx = top-2 */
	cmpl	%edx,%esi
	ja	.L65
	/* here, there is one word of input left */
	cmpw	(%esi),%cx
	jge	.L64
	movw	(%esi),%cx
	.p2align 2
.L64:
	cmpw	(%esi),%ax
	jle	.L65
	movw	(%esi),%ax
	.p2align 2
.L65:	/* (finally!) cx is the max, ax the min */
	movswl	%cx,%ecx
	movswl	%ax,%eax
	movl	16(%ebp),%edx		/* ptr to output max,min vals */
	andl	%edx,%edx
	jz	.L77			/* out == NULL: skip the stores */
	movw	%cx,(%edx)		/* max */
	movw	%ax,2(%edx)		/* min */
	.p2align 2
.L77:	/* now calculate max absolute val */
	negl	%eax			/* eax = -min */
	cmpl	%ecx,%eax
	jge	.L81
	movl	%ecx,%eax		/* max is larger than -min */
	.p2align 2
.L81:
	emms
	popl	%esi
	leave
	ret
.Lfe6:
	.size	k6maxmin,.Lfe6-k6maxmin

/*
 * void Short_term_analysis_filtering (short *u0, const short *rp0, int kn, short *s)
 * (the exported symbol is Short_term_analysis_filteringx)
 *
 * For each of the kn words at s, runs an 8-step recurrence over the state
 * words u0[0..7] and the coefficients rp0[0..7], and writes the result back
 * to the same position in s.  With di and sav both starting as *s, each step
 * i does:
 *     ui    = u0[i];
 *     u0[i] = sav;
 *     sav   = saturating_add16(ui, saturate16((rp0[i] * di + 0x4000) >> 15));
 *     di    = saturating_add16(di, saturate16((rp0[i] * ui + 0x4000) >> 15));
 * and after the eighth step *s = di.
 *
 * The two products of a step are computed together, so rp0 is first copied
 * to the stack with every word duplicated.
 *
 * In the register comments a pair (A,B) means A in the high word and B in
 * the low word of a 32-bit value.
 */
	.equiv pm_u0,8
	.equiv pm_rp0,12
	.equiv pm_kn,16
	.equiv pm_s,20
	.equiv lv_u_top,-4
	.equiv lv_s_top,-8
	.equiv lv_rp,-40	/* local version of rp0 with each word twice */
	.align 4
.globl Short_term_analysis_filteringx
	.type	Short_term_analysis_filteringx,@function
Short_term_analysis_filteringx:
	pushl	%ebp
	movl	%esp,%ebp
	subl	$40,%esp
	pushl	%edi
	pushl	%esi
	movl	pm_rp0(%ebp),%esi
	leal	lv_rp(%ebp),%edi
	cld
	/* copy the 8 coefficients, storing each one twice */
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	emms
	movl	$0x4000,%eax
	movd	%eax,%mm4
	punpckldq %mm4,%mm4	/* (0x00004000,0x00004000) for rounding dword product pairs */
	movl	pm_u0(%ebp),%eax
	addl	$16,%eax
	movl	%eax,lv_u_top(%ebp)	/* UTOP = u0 + 8 words */
	movl	pm_s(%ebp),%edx		/* edx is local s ptr throughout below */
	movl	pm_kn(%ebp),%eax
	leal	(%edx,%eax,2),%eax
	movl	%eax,lv_s_top(%ebp)	/* s + kn words */
	cmpl	%eax,%edx
	jae	.L179			/* nothing to process */
	.p2align 2
.L181:
	leal	lv_rp(%ebp),%esi	/* RP */
	movl	pm_u0(%ebp),%edi	/* U */
	movw	(%edx),%ax		/* (0,DI) */
	roll	$16,%eax
	movw	(%edx),%ax		/* (DI,DI) */
	.p2align 2
.L185:	/* RP is %esi */
	movl	%eax,%ecx		/* keep the previous low word for the store below */
	movw	(%edi),%ax		/* (DI,U) */
	movd	(%esi),%mm3		/* mm3 is (0,0,RP,RP) */
	movw	%cx,(%edi)		/* u0[i] = previous low word */
	movd	%eax,%mm2		/* mm2 is (0,0,DI,U) */
	rorl	$16,%eax
	movd	%eax,%mm1		/* mm1 is (0,0,U,DI) */
	movq	%mm1,%mm0
	pmullw	%mm3,%mm0		/* low halves of the two products */
	pmulhw	%mm3,%mm1		/* high halves of the two products */
	punpcklwd %mm1,%mm0		/* mm0 is (RP*U,RP*DI) */
	paddd	%mm4,%mm0		/* mm4 is 0x00004000,0x00004000 */
	psrad	$15,%mm0		/* (RP*U,RP*DI) adjusted */
	packssdw %mm0,%mm0		/* (*,*,RP*U,RP*DI) adjusted and saturated to word */
	paddsw	%mm2,%mm0		/* mm0 is (?,?, DI', U') */
	movd	%mm0,%eax		/* (DI,U') */
	addl	$2,%edi
	addl	$4,%esi
	cmpl	lv_u_top(%ebp),%edi
	jb	.L185
	rorl	$16,%eax
	movw	%ax,(%edx)		/* last DI goes to *s */
	addl	$2,%edx			/* next s */
	cmpl	lv_s_top(%ebp),%edx
	jb	.L181
	.p2align 2
.L179:
	emms
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe7:
	.size	Short_term_analysis_filteringx,.Lfe7-Short_term_analysis_filteringx
.end

/*
 * NOTE: GNU as stops processing at the .end directive above, so nothing from
 * here to the end of the file is assembled.  What follows is an alternative
 * version of Short_term_analysis_filteringx that keeps the eight u words in
 * %mm4/%mm5 instead of memory.  It reuses the symbol, .equiv and label names
 * of the version above.
 */

/* 'as' macro's seem to be case-insensitive */
.macro STEP n
.if \n
	movd	\n(%esi),%mm3		/* mm3 is (0,0,RP,RP) */
.else
	movd	(%esi),%mm3		/* mm3 is (0,0,RP,RP) */
.endif
	movq	%mm5,%mm1
	movd	%mm4,%ecx
	movw	%cx,%ax			/* (DI,U) */
	psllq	$48,%mm1
	psrlq	$16,%mm4
	por	%mm1,%mm4
	psllq	$48,%mm0
	psrlq	$16,%mm5
	por	%mm0,%mm5
	movd	%eax,%mm2		/* mm2 is (0,0,DI,U) */
	rorl	$16,%eax
	movd	%eax,%mm1		/* mm1 is (0,0,U,DI) */
	movq	%mm1,%mm0
	pmullw	%mm3,%mm0
	pmulhw	%mm3,%mm1
	punpcklwd %mm1,%mm0		/* mm0 is (RP*U,RP*DI) */
	paddd	%mm6,%mm0		/* mm6 is 0x00004000,0x00004000 */
	psrad	$15,%mm0		/* (RP*U,RP*DI) adjusted */
	packssdw %mm0,%mm0		/* (*,*,RP*U,RP*DI) adjusted and saturated to word */
	paddsw	%mm2,%mm0		/* mm0 is (?,?, DI', U') */
	movd	%mm0,%eax		/* (DI,U') */
.endm

/* void Short_term_analysis_filtering (short *u0, const short *rp0, int kn, short *s) */
	.equiv pm_u0,8
	.equiv pm_rp0,12
	.equiv pm_kn,16
	.equiv pm_s,20
	.equiv lv_rp_top,-4
	.equiv lv_s_top,-8
	.equiv lv_rp,-40	/* local version of rp0 with each word twice */
	.align 4
.globl Short_term_analysis_filteringx
	.type	Short_term_analysis_filteringx,@function
Short_term_analysis_filteringx:
	pushl	%ebp
	movl	%esp,%ebp
	subl	$56,%esp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	pm_rp0(%ebp),%esi
	leal	lv_rp(%ebp),%edi
	cld
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	lodsw
	stosw
	stosw
	movl	%edi,lv_rp_top(%ebp)
	emms
	movl	$0x4000,%eax
	movd	%eax,%mm6
	punpckldq %mm6,%mm6	/* (0x00004000,0x00004000) for rounding dword product pairs */
	movl	pm_u0(%ebp),%ebx
	movq	(%ebx),%mm4
	movq	8(%ebx),%mm5		/* the 8 u's */
	movl	pm_s(%ebp),%edx		/* edx is local s ptr throughout below */
	movl	pm_kn(%ebp),%eax
	leal	(%edx,%eax,2),%eax
	movl	%eax,lv_s_top(%ebp)
	cmpl	%eax,%edx
	jae	.L179
	.p2align 2
.L181:
	leal	lv_rp(%ebp),%esi	/* RP */
	movw	(%edx),%ax		/* (0,DI) */
	roll	$16,%eax
	movw	(%edx),%ax		/* (DI,DI) */
	movd	%eax,%mm0
	.p2align 2
.L185:	/* RP is %esi */
	step 0
	step 4
	step 8
	step 12
/*
	step 16
	step 20
	step 24
	step 28
*/
	addl	$16,%esi
	cmpl	lv_rp_top(%ebp),%esi
	jb	.L185
	rorl	$16,%eax
	movw	%ax,(%edx)		/* last DI goes to *s */
	addl	$2,%edx			/* next s */
	cmpl	lv_s_top(%ebp),%edx
	jb	.L181
.L179:
	movq	%mm4,(%ebx)
	movq	%mm5,8(%ebx)		/* the 8 u's */
	emms
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe7:
	.size	Short_term_analysis_filteringx,.Lfe7-Short_term_analysis_filteringx
	.ident	"GCC: (GNU) 2.95.2 19991109 (Debian GNU/Linux)"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -