📄 ccomplex_dotprod_sse.s
字号:
#
# Copyright 2002 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#

#-----------------------------------------------------------------------
# void ccomplex_dotprod_sse (const float *input, const float *taps,
#                            unsigned n_2_ccomplex_blocks, float *result)
#
# Syntax: GNU as, AT&T operand order.  Target: 32-bit x86 with SSE.
# Args:   taken from the stack at 8/12/16/20(%ebp) after the prologue.
# In:     input, taps          - guaranteed to be 16 byte aligned
#         n_2_ccomplex_blocks  - number of 16-byte blocks (2 complex
#                                floats each); is != 0
# Out:    result[0] = real part, result[1] = imaginary part
# Clobb:  %eax, %ecx, %edx, %xmm0-%xmm7, flags, and the stack slot of
#         the n_2_ccomplex_blocks argument (reused as scratch to build
#         the sign mask).
#
# Only bytes inside the first n_2_ccomplex_blocks * 16 bytes of input
# and taps are read; nothing is loaded past the end of either buffer.
#
# Reference implementation:
#
#  ccomplex_dotprod_generic (const float *input,
#         const float *taps, unsigned n_2_ccomplex_blocks, float *result)
#  {
#    float sum0 = 0;
#    float sum1 = 0;
#    float sum2 = 0;
#    float sum3 = 0;
#
#    do {
#
#      sum0 += input[0] * taps[0] - input[1] * taps[1];
#      sum1 += input[0] * taps[1] + input[1] * taps[0];
#      sum2 += input[2] * taps[2] - input[3] * taps[3];
#      sum3 += input[2] * taps[3] + input[3] * taps[2];
#
#      input += 4;
#      taps += 4;
#
#    } while (--n_2_ccomplex_blocks != 0);
#
#
#    result[0] = sum0 + sum2;
#    result[1] = sum1 + sum3;
#  }
#
# TODO: prefetch and better scheduling
#-----------------------------------------------------------------------

        .file   "ccomplex_dotprod_sse.s"
        .version        "01.01"

        .equ    ARG_INPUT,   8          # const float *input
        .equ    ARG_TAPS,    12         # const float *taps
        .equ    ARG_NBLOCKS, 16         # unsigned n_2_ccomplex_blocks
        .equ    ARG_RESULT,  20         # float *result
        .equ    BLOCK_BYTES, 16         # one block = 2 complex floats
        .equ    SIGN_BIT,    0x80000000 # IEEE-754 single sign bit

.text
        .align 16
.globl ccomplex_dotprod_sse
        .type   ccomplex_dotprod_sse,@function
ccomplex_dotprod_sse:
        pushl   %ebp
        movl    %esp, %ebp
        movl    ARG_INPUT(%ebp), %eax           # eax = input
        movl    ARG_TAPS(%ebp), %edx            # edx = taps
        movl    ARG_NBLOCKS(%ebp), %ecx         # ecx = n_2_ccomplex_blocks

        # Accumulators (per lane, before the final reduction):
        #   xmm6 = sum of  A * B          -> re*re, im*im products
        #   xmm7 = sum of  swap(A) * B    -> im*re, re*im products
        xorps   %xmm6, %xmm6
        xorps   %xmm7, %xmm7

        shrl    $1, %ecx                # ecx = number of block pairs
        jz      .Ltail                  # only a single block to do

        #
        # 4 taps / loop
        # something like ?? cycles / loop
        #
        # complex prod: C += A * B, w/ temp Z
        #   A=xmm0, B=xmm2, Z=xmm4
        #   A'=xmm1, B'=xmm3, Z'=xmm5
        #
        # Both blocks are loaded at the top of the iteration, so the
        # loop never touches memory beyond the pair it is working on.
        #
        .p2align 4
.Lpair_loop:
        movaps  0(%eax), %xmm0                  # A
        movaps  BLOCK_BYTES(%eax), %xmm1        # A'
        movaps  0(%edx), %xmm2                  # B
        movaps  BLOCK_BYTES(%edx), %xmm3        # B'

        movaps  %xmm0, %xmm4
        movaps  %xmm1, %xmm5
        mulps   %xmm2, %xmm0                    # A  * B
        shufps  $0xb1, %xmm4, %xmm4             # Z  = A  with re/im swapped
        mulps   %xmm3, %xmm1                    # A' * B'
        shufps  $0xb1, %xmm5, %xmm5             # Z' = A' with re/im swapped

        addps   %xmm0, %xmm6
        mulps   %xmm4, %xmm2                    # Z  * B
        addps   %xmm1, %xmm6
        mulps   %xmm5, %xmm3                    # Z' * B'

        addl    $2*BLOCK_BYTES, %eax
        addps   %xmm2, %xmm7
        addl    $2*BLOCK_BYTES, %edx
        addps   %xmm3, %xmm7

        decl    %ecx
        jnz     .Lpair_loop

.Ltail:
        # We've handled the bulk of multiplies up to here.
        # Let's see if original n_2_ccomplex_blocks was odd.
        # If so, we've got 2 more taps to do.
        testl   $1, ARG_NBLOCKS(%ebp)
        jz      .Lreduce

        # The count was odd, do 2 more taps.  eax/edx point at the
        # final block, which is loaded only now that we know it exists.
        movaps  0(%eax), %xmm0
        movaps  0(%edx), %xmm2
        movaps  %xmm0, %xmm4
        mulps   %xmm2, %xmm0
        shufps  $0xb1, %xmm4, %xmm4             # swap internals
        addps   %xmm0, %xmm6
        mulps   %xmm4, %xmm2
        addps   %xmm2, %xmm7

.Lreduce:
        # neg inversor: build a mask with the sign bit set in lanes 1
        # and 3.  The count argument's stack slot is no longer needed
        # and serves as scratch for getting the constant into xmm1.
        xorps   %xmm1, %xmm1
        movl    $SIGN_BIT, ARG_NBLOCKS(%ebp)
        movss   ARG_NBLOCKS(%ebp), %xmm1        # lanes (low..high): -0 0 0 0
        shufps  $0x11, %xmm1, %xmm1             # b00010001 -> 0 -0 0 -0

        # SSE replacement for pfpnacc: negate the im*im products, then
        # fold the two accumulators into (real, imag) pairs.
        xorps   %xmm1, %xmm6
        movaps  %xmm6, %xmm2
        unpcklps %xmm7, %xmm6
        unpckhps %xmm7, %xmm2
        movaps  %xmm2, %xmm3
        shufps  $0x44, %xmm6, %xmm2             # b01000100
        shufps  $0xee, %xmm3, %xmm6             # b11101110
        addps   %xmm2, %xmm6                    # xmm6 = r1 i2 r3 i4

        movl    ARG_RESULT(%ebp), %eax          # @result
        movhlps %xmm6, %xmm4                    # xmm4 = r3 i4 ?? ??
        addps   %xmm4, %xmm6                    # xmm6 = r1+r3 i2+i4 ?? ??
        movlps  %xmm6, (%eax)                   # store low 2x32 bits (complex) to memory

        # No emms here: this routine executes no MMX instruction.
        popl    %ebp
        ret

        .size   ccomplex_dotprod_sse, .-ccomplex_dotprod_sse
        .ident  "Hand coded x86 SSE assembly"

        # Tell GNU ld that this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -