/* [code-viewer page header, not part of the assembly source] k6opt.s */
/* [viewer control] Font size: */
/* k6opt.s vector functions optimized for MMX extensions to x86
*
* Copyright (C) 1999 by Stanley J. Brooks <stabro@megsinet.net>
*
* Any use of this software is permitted provided that this notice is not
* removed and that neither the authors nor the Technische Universitaet Berlin
* are deemed to have made any representations as to the suitability of this
* software for any purpose nor are held responsible for any defects of
* this software. THERE IS ABSOLUTELY NO WARRANTY FOR THIS SOFTWARE;
* not even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE.
*
* Chicago, 03.12.1999
* Stanley J. Brooks
*/
.file "k6opt.s"
.version "01.01"
/* gcc2_compiled.: */
.section .rodata
.align 4
/*
 * coefs: twelve signed 16-bit multipliers (24 bytes) that Weighting_filter
 * loads as three quadwords.  The largest entry, 8192, equals 1 << 13, which
 * matches the right shift by 13 applied to every filter result.
 */
.type coefs,@object
.size coefs,24
coefs:
	.value	-134, -374, 0, 2054		/* coefs+0  -> mm1 */
	.value	5741, 8192, 5741, 2054		/* coefs+8  -> mm2 */
	.value	0, -374, -134, 0		/* coefs+16 -> mm3 */
.text
.align 4
/*
 * void Weighting_filter (const short *e, short *x)
 *
 * Writes the 40 words x[0] .. x[39].  For every k:
 *
 *	x[k] = sat16( (sum(coefs[i] * e[k-5+i], i = 0 .. 11) + 0x1000) >> 13 )
 *
 * The products are accumulated in 32 bits, the shift is arithmetic and the
 * narrowing to 16 bits saturates (packssdw).  The quadword loads touch
 * e[-5] through e[45]; the word at e[k+6] is multiplied by the zero that
 * ends the coefs table but is still read from memory.
 *
 * Stack arguments: 8(%ebp) = e, 12(%ebp) = x.
 * Saved and restored: %ebx, %esi, %edi, %ebp.
 * Overwritten: %eax, mm0 - mm5.  emms runs on entry and before returning.
 */
.globl Weighting_filter
.type Weighting_filter,@function
Weighting_filter:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	8(%ebp),%ebx			/* ebx = e */
	movl	12(%ebp),%edi			/* edi = x */
	subl	$10,%ebx			/* ebx = &e[-5]: window starts 5 words back */
	emms
	movq	coefs,%mm1			/* multipliers 0 .. 3 */
	movq	coefs+8,%mm2			/* multipliers 4 .. 7 */
	movq	coefs+16,%mm3			/* multipliers 8 .. 11 */
	movl	$0x1000,%eax
	movd	%eax,%mm5			/* rounding term: half of 1 << 13, low dword only */
	xorl	%esi,%esi			/* esi = k */
	.p2align 2
.Lwf_sample:
	movq	(%ebx,%esi,2),%mm0		/* e[k-5 .. k-2] */
	pmaddwd	%mm1,%mm0
	movq	8(%ebx,%esi,2),%mm4		/* e[k-1 .. k+2] */
	pmaddwd	%mm2,%mm4
	paddd	%mm4,%mm0
	movq	16(%ebx,%esi,2),%mm4		/* e[k+3 .. k+6] */
	pmaddwd	%mm3,%mm4
	paddd	%mm4,%mm0			/* mm0 = two 32-bit partial sums */
	movq	%mm0,%mm4
	psrlq	$32,%mm4			/* low dword of mm4 = upper partial sum */
	paddd	%mm4,%mm0			/* low dword of mm0 = complete sum */
	paddd	%mm5,%mm0			/* add the rounding term */
	psrad	$13,%mm0
	packssdw %mm0,%mm0			/* saturate to 16 bits */
	movd	%mm0,%eax			/* ax = x[k] */
	movw	%ax,(%edi,%esi,2)
	addl	$1,%esi
	cmpl	$40,%esi
	jl	.Lwf_sample			/* k = 0 .. 39 */
	emms
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp				/* %esp equals %ebp here, so this undoes the frame */
	ret
	.size	Weighting_filter,.-Weighting_filter
/*
 * ccstep n -- multiply-accumulate step used by k6maxcc and k6iprod.
 *
 * Loads the quadword at byte offset n from %edi into mm1 and the quadword
 * at byte offset n from %esi into mm2, multiplies them with pmaddwd and
 * adds the two 32-bit results to mm0.  Overwrites mm1 and mm2.
 * Offset 0 is assembled without a displacement.
 */
.macro ccstep n
.ifeq \n
	movq	(%esi),%mm2
	movq	(%edi),%mm1
.else
	movq	\n(%esi),%mm2
	movq	\n(%edi),%mm1
.endif
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
.endm
.align 4
/*
 * long k6maxcc(const short *wt, const short *dp, short *Nc_out)
 *
 * For every lag Nc from 40 to 120 computes the 32-bit sum
 *
 *	sum(wt[k] * dp[k - Nc], k = 0 .. 39)
 *
 * and returns the largest sum that is strictly greater than 0 (0 if there
 * is none).  The lag that produced it is stored in *Nc_out; when no sum is
 * positive the stored lag is 40.  On equal sums the smaller lag is kept.
 * Reads wt[0 .. 39] and dp[-120 .. -1].
 *
 * Stack arguments: 8(%ebp) = wt, 12(%ebp) = dp, 16(%ebp) = Nc_out.
 * Saved and restored: %ebx, %esi, %edi, %ebp.
 * Overwritten: %eax (result), %ecx, %edx, mm0 - mm2.
 * emms runs on entry and before returning.
 */
.globl k6maxcc
.type k6maxcc,@function
k6maxcc:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	emms
	movl	12(%ebp),%esi			/* esi = dp */
	movl	8(%ebp),%edi			/* edi = wt */
	leal	-80(%esi),%esi			/* esi = &dp[-40], the window for lag 40 */
	xorl	%edx,%edx			/* edx = best sum so far */
	movl	$40,%ecx			/* ecx = lag of the best sum */
	movl	%ecx,%ebx			/* ebx = current lag */
	.p2align 2
.Lcc_lag:
	movq	(%esi),%mm2
	movq	(%edi),%mm0
	pmaddwd	%mm2,%mm0			/* words 0 .. 3 start the accumulator */
	.irp	off,8,16,24,32,40,48,56,64,72
	ccstep	\off				/* words 4 .. 39, four per step */
	.endr
	movq	%mm0,%mm1
	psrlq	$32,%mm1			/* low dword of mm1 = upper partial sum */
	paddd	%mm1,%mm0
	movd	%mm0,%eax			/* eax = sum for this lag */
	cmpl	%eax,%edx
	jge	.Lcc_keep			/* not strictly greater: keep the old best */
	movl	%ebx,%ecx
	movl	%eax,%edx
	.p2align 2
.Lcc_keep:
	subl	$2,%esi				/* next lag starts one word earlier */
	addl	$1,%ebx
	cmpl	$121,%ebx
	jl	.Lcc_lag			/* lags 40 .. 120 */
	movl	16(%ebp),%eax
	movw	%cx,(%eax)			/* *Nc_out = best lag */
	movl	%edx,%eax			/* return the best sum */
	emms
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp				/* %esp equals %ebp here, so this undoes the frame */
	ret
	.size	k6maxcc,.-k6maxcc
.align 4
/*
 * long k6iprod (const short *p, const short *q, int n)
 *
 * Returns the sum of p[i] * q[i] for i = 0 .. n-1.  The sum is accumulated
 * in 32 bits (paddd, which wraps on overflow).  With n == 0 no memory is
 * read and the result is 0.
 *
 * The vectors are consumed in four stages: 16 words per iteration, then
 * 4 words per iteration, then at most one pair, then at most one single
 * word.  Each stage runs only while that many words remain, so nothing
 * beyond p[n-1] / q[n-1] is read.
 *
 * Stack arguments: 8(%ebp) = p, 12(%ebp) = q, 16(%ebp) = n.
 * Saved and restored: %esi, %edi, %ebp.
 * Overwritten: %eax (result), %edx, mm0 - mm2.
 * emms runs on entry and before returning.
 */
.globl k6iprod
.type k6iprod,@function
k6iprod:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edi
	pushl	%esi
	emms
	pxor	%mm0,%mm0			/* mm0 = two 32-bit partial sums */
	movl	8(%ebp),%esi			/* esi = p */
	movl	12(%ebp),%edi			/* edi = q */
	movl	16(%ebp),%eax			/* eax = n */
	leal	-32(%esi,%eax,2),%edx		/* edx = top - 32, with top = p + 2*n */
	cmpl	%edx,%esi
	ja	.L202
	.p2align 2
.L201:						/* 16 words per iteration */
	ccstep	0
	ccstep	8
	ccstep	16
	ccstep	24
	addl	$32,%esi
	addl	$32,%edi
	cmpl	%edx,%esi
	jbe	.L201
	.p2align 2
.L202:
	addl	$24,%edx			/* now edx = top-8 */
	cmpl	%edx,%esi
	ja	.L205
	.p2align 2
.L203:						/* 4 words per iteration */
	ccstep	0
	addl	$8,%esi
	addl	$8,%edi
	cmpl	%edx,%esi
	jbe	.L203
	.p2align 2
.L205:
	addl	$4,%edx				/* now edx = top-4 */
	cmpl	%edx,%esi
	ja	.L207
	movd	(%edi),%mm1			/* one pair; movd clears the upper dword */
	movd	(%esi),%mm2
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
	addl	$4,%esi
	addl	$4,%edi
	.p2align 2
.L207:
	addl	$2,%edx				/* now edx = top-2 */
	cmpl	%edx,%esi
	ja	.L209
	/*
	 * Last single word.  The loads must zero-extend: pmaddwd multiplies
	 * BOTH 16-bit lanes of the low dword and adds the two products.  A
	 * sign-extending load leaves 0xFFFF (-1) in the upper lane of a
	 * negative value, so two negative operands would contribute an extra
	 * (-1) * (-1) = 1, e.g. (-3) * (-5) would come out as 16.
	 * pmaddwd still treats the low lane as signed, so the product keeps
	 * the correct sign.
	 */
	movzwl	(%edi),%eax
	movd	%eax,%mm1
	movzwl	(%esi),%eax
	movd	%eax,%mm2
	pmaddwd	%mm2,%mm1
	paddd	%mm1,%mm0
	.p2align 2
.L209:
	movq	%mm0,%mm1
	punpckhdq %mm0,%mm1			/* mm1 has high int32 of mm0 dup'd */
	paddd	%mm1,%mm0
	movd	%mm0,%eax			/* eax has result */
	emms
	popl	%esi
	popl	%edi
	leave
	ret
.Lfe3:
	.size	k6iprod,.Lfe3-k6iprod
.align 4
/*
 * void k6vsraw (short *p, int n, int bits)
 *
 * In-place arithmetic right shift of the n 16-bit words at p by `bits`.
 * Before shifting, a per-word constant is added with signed saturation
 * (paddsw).  The constant is (ones << bits) >> 1, computed per word lane
 * with logical shifts.
 * NOTE(review): `ones` is defined outside this block; presumably it holds
 * 1 in every word, making the constant 1 << (bits-1) (round to nearest)
 * -- confirm against its definition.
 *
 * Returns immediately, without touching memory or MMX state, when
 * bits <= 0.  Words are processed 8 at a time, then 2 at a time, then a
 * final single word if one is left.
 *
 * Stack arguments: 8(%ebp) = p, 12(%ebp) = n, 16(%ebp) = bits.
 * Saved and restored: %esi, %ebp.
 * Overwritten: %eax, %ecx, %edx, mm0 - mm3.
 * emms runs before the first MMX instruction and before returning, except
 * on the early-return path.
 */
.globl k6vsraw
.type k6vsraw,@function
k6vsraw:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	movl	16(%ebp),%ecx			/* ecx = bits */
	movl	8(%ebp),%esi			/* esi = cursor, starts at p */
	testl	%ecx,%ecx
	jle	.Lsra_ret			/* bits <= 0: nothing to do */
	movl	12(%ebp),%eax			/* eax = n */
	leal	-16(%esi,%eax,2),%edx		/* edx = top - 16, with top = p + 2*n */
	emms
	movd	%ecx,%mm3			/* mm3 = shift count */
	movq	ones,%mm2
	psllw	%mm3,%mm2
	psrlw	$1,%mm2				/* mm2 = (ones << bits) >> 1 per word */
	cmpl	%esi,%edx
	jb	.Lsra_pairs			/* fewer than 8 words */
	.p2align 2
.Lsra_oct:					/* 8 words per iteration */
	movq	(%esi),%mm0
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	movq	8(%esi),%mm1
	paddsw	%mm2,%mm1
	psraw	%mm3,%mm1
	movq	%mm0,(%esi)
	movq	%mm1,8(%esi)
	addl	$16,%esi
	cmpl	%esi,%edx
	jae	.Lsra_oct
	.p2align 2
.Lsra_pairs:
	addl	$12,%edx			/* edx = top - 4 */
	cmpl	%esi,%edx
	jb	.Lsra_last
	.p2align 2
.Lsra_pair:					/* up to 6 words, two at a time */
	movd	(%esi),%mm0
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	movd	%mm0,(%esi)
	addl	$4,%esi
	cmpl	%esi,%edx
	jae	.Lsra_pair
	.p2align 2
.Lsra_last:
	addl	$2,%edx				/* edx = top - 2 */
	cmpl	%esi,%edx
	jb	.Lsra_done
	movzwl	(%esi),%eax			/* single leftover word via 16-bit load/store */
	movd	%eax,%mm0
	paddsw	%mm2,%mm0
	psraw	%mm3,%mm0
	movd	%mm0,%eax
	movw	%ax,(%esi)
	.p2align 2
.Lsra_done:
	emms
.Lsra_ret:
	popl	%esi
	popl	%ebp				/* %esp equals %ebp here, so this undoes the frame */
	ret
	.size	k6vsraw,.-k6vsraw
.align 4
/*
 * void k6vsllw (short *p, int n, int bits)
 *
 * In-place left shift of the n 16-bit words at p by `bits` (psllw: bits
 * shifted out of a word are discarded, zeros are shifted in).
 *
 * Returns immediately, without touching memory or MMX state, when
 * bits <= 0.  Words are processed 8 at a time, then 2 at a time, then a
 * final single word if one is left.
 *
 * Stack arguments: 8(%ebp) = p, 12(%ebp) = n, 16(%ebp) = bits.
 * Saved and restored: %esi, %ebp.
 * Overwritten: %eax, %ecx, %edx, mm0, mm1, mm3.
 * emms runs before the first MMX instruction and before returning, except
 * on the early-return path.
 */
.globl k6vsllw
.type k6vsllw,@function
k6vsllw:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	movl	16(%ebp),%ecx			/* ecx = bits */
	movl	8(%ebp),%esi			/* esi = cursor, starts at p */
	testl	%ecx,%ecx
	jle	.Lsll_ret			/* bits <= 0: nothing to do */
	movl	12(%ebp),%eax			/* eax = n */
	leal	-16(%esi,%eax,2),%edx		/* edx = top - 16, with top = p + 2*n */
	emms
	movd	%ecx,%mm3			/* mm3 = shift count */
	cmpl	%esi,%edx
	jb	.Lsll_pairs			/* fewer than 8 words */
	.p2align 2
.Lsll_oct:					/* 8 words per iteration */
	movq	(%esi),%mm0
	psllw	%mm3,%mm0
	movq	8(%esi),%mm1
	psllw	%mm3,%mm1
	movq	%mm0,(%esi)
	movq	%mm1,8(%esi)
	addl	$16,%esi
	cmpl	%esi,%edx
	jae	.Lsll_oct
	.p2align 2
.Lsll_pairs:
	addl	$12,%edx			/* edx = top - 4 */
	cmpl	%esi,%edx
	jb	.Lsll_last
	.p2align 2
.Lsll_pair:					/* up to 6 words, two at a time */
	movd	(%esi),%mm0
	psllw	%mm3,%mm0
	movd	%mm0,(%esi)
	addl	$4,%esi
	cmpl	%esi,%edx
	jae	.Lsll_pair
	.p2align 2
.Lsll_last:
	addl	$2,%edx				/* edx = top - 2 */
	cmpl	%esi,%edx
	jb	.Lsll_done
	movzwl	(%esi),%eax			/* single leftover word via 16-bit load/store */
	movd	%eax,%mm0
	psllw	%mm3,%mm0
	movd	%mm0,%eax
	movw	%ax,(%esi)
	.p2align 2
.Lsll_done:
	emms
.Lsll_ret:
	popl	%esi
	popl	%ebp				/* %esp equals %ebp here, so this undoes the frame */
	ret
	.size	k6vsllw,.-k6vsllw
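/*
 * [code-viewer page footer, not part of the assembly source]
 * Keyboard shortcuts
 *   Copy code              Ctrl + C
 *   Search code            Ctrl + F
 *   Full-screen mode       F11
 *   Toggle theme           Ctrl + Shift + D
 *   Show shortcuts         ?
 *   Increase font size     Ctrl + =
 *   Decrease font size     Ctrl + -
 */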