⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ccomplex_dotprod_sse.s

📁 gnuradio软件无线电源程序.现在的手机多基于软件无线电
💻 S
字号:
#
# Copyright 2002 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#

#-----------------------------------------------------------------------
# void ccomplex_dotprod_sse (const float *input, const float *taps,
#                            unsigned n_2_ccomplex_blocks, float *result)
#
# Syntax:  GNU as, AT&T operand order (src, dst).
# Target:  IA-32, SSE (no SSE2), ELF.
# ABI:     32-bit cdecl, all arguments on the stack:
#            8(%ebp)  = input   (interleaved re/im floats)
#            12(%ebp) = taps    (interleaved re/im floats)
#            16(%ebp) = n_2_ccomplex_blocks (count of 16-byte blocks,
#                       i.e. pairs of complex values)
#            20(%ebp) = result  (receives 2 floats: re, im)
# Clobbers: %eax, %ecx, %edx, %xmm0-%xmm7, flags.
#           The argument slot 16(%ebp) is reused as scratch storage.
#
# input and taps are guaranteed to be 16 byte aligned (movaps is used).
# Callers pass n_2_ccomplex_blocks != 0; a zero count is tolerated,
# reads nothing from input/taps and stores 0 + 0i.
# Exactly n_2_ccomplex_blocks * 16 bytes are read from each array;
# nothing is read past the end.
#
# Reference implementation:
#
#  ccomplex_dotprod_generic (const float *input,
#                         const float *taps, unsigned n_2_ccomplex_blocks, float *result)
#  {
#    float sum0 = 0;
#    float sum1 = 0;
#    float sum2 = 0;
#    float sum3 = 0;
#
#    do {
#
#      sum0 += input[0] * taps[0] - input[1] * taps[1];
#      sum1 += input[0] * taps[1] + input[1] * taps[0];
#      sum2 += input[2] * taps[2] - input[3] * taps[3];
#      sum3 += input[2] * taps[3] + input[3] * taps[2];
#
#      input += 4;
#      taps += 4;
#
#    } while (--n_2_ccomplex_blocks != 0);
#
#
#    result[0] = sum0 + sum2;
#    result[1] = sum1 + sum3;
#  }
#
# TODO: prefetch and better scheduling
#-----------------------------------------------------------------------

	.file	"ccomplex_dotprod_sse.s"
	.version	"01.01"

	.text
	.align 16
	.globl ccomplex_dotprod_sse
	.type	 ccomplex_dotprod_sse,@function
ccomplex_dotprod_sse:
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %eax		# input
	movl	12(%ebp), %edx		# taps
	movl	16(%ebp), %ecx		# n_2_ccomplex_blocks

	# xmm6 accumulates in*tap lane by lane:   re*re, im*im, re*re, im*im
	# xmm7 accumulates swap(in)*tap:          im*re, re*im, im*re, re*im
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	testl	%ecx, %ecx
	jz	.Lreduce		# empty input: touch no memory, result is 0 + 0i

	movaps	0(%eax), %xmm0		# preload first block (valid because n >= 1)
	movaps	0(%edx), %xmm2

	# The loop is software pipelined: every iteration consumes two blocks
	# and preloads a third.  Running it (n - 1) / 2 times guarantees that
	# the preloaded block always exists, so nothing beyond the arrays is
	# read.  The remaining one (n odd) or two (n even) blocks are
	# finished after the loop.
	decl	%ecx
	shrl	$1, %ecx		# ecx = (n - 1) / 2, always < 2^31
	jmp	.Lloop_test

	#
	# Main loop: two blocks (four complex taps) per iteration.
	#
	# complex prod: C += A * B,  w/ temp Z
	#   Z  = A with re/im swapped inside each complex (shufps $0xb1)
	#   xmm6 += A * B
	#   xmm7 += Z * B
	# The sign flip and horizontal adds are deferred to .Lreduce.
	#
	# A=xmm0, B=xmm2, Z=xmm4
	# A'=xmm1, B'=xmm3, Z'=xmm5
	#
	.p2align 4
.Lloop1:
	movaps	16(%eax), %xmm1
	movaps	%xmm0, %xmm4
	mulps	%xmm2, %xmm0
	shufps	$0xb1, %xmm4, %xmm4	# swap internals
	movaps	16(%edx), %xmm3
	movaps	%xmm1, %xmm5
	addps	%xmm0, %xmm6
	mulps	%xmm3, %xmm1
	shufps	$0xb1, %xmm5, %xmm5	# swap internals
	addps	%xmm1, %xmm6
	mulps	%xmm4, %xmm2
	movaps	32(%eax), %xmm0		# preload A for the next step
	addps	%xmm2, %xmm7
	mulps	%xmm5, %xmm3
	addl	$32, %eax
	movaps	32(%edx), %xmm2		# preload B for the next step
	addps	%xmm3, %xmm7
	addl	$32, %edx
.Lloop_test:
	decl	%ecx
	jge	.Lloop1

	# Here xmm0/xmm2 always hold one valid, not yet accumulated block
	# (either the initial preload or the one fetched by the last
	# loop iteration).  Fold it in.
	movaps	%xmm0, %xmm4
	mulps	%xmm2, %xmm0
	shufps	$0xb1, %xmm4, %xmm4	# swap internals
	addps	%xmm0, %xmm6
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm7

	# Let's see if the original n_2_ccomplex_blocks was odd.
	# If so, the block above was the last one.
	testl	$1, 16(%ebp)		# n_2_ccomplex_blocks
	jnz	.Lreduce

	# The count was even: exactly one more block remains at offset 16.
	movaps	16(%eax), %xmm1
	movaps	16(%edx), %xmm3
	movaps	%xmm1, %xmm5
	mulps	%xmm3, %xmm1
	shufps	$0xb1, %xmm5, %xmm5	# swap internals
	addps	%xmm1, %xmm6
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm7

.Lreduce:
	# Build the sign mask { 0, -0, 0, -0 }.  SSE1 has no GPR->XMM move,
	# so the constant goes through memory; the count argument slot is
	# no longer needed and serves as scratch.  movss from memory
	# zero-fills the upper three lanes.
	movl	$0x80000000, 16(%ebp)
	movss	16(%ebp), %xmm1
	shufps	$0x11, %xmm1, %xmm1	# b00010001 # 0 -0 0 -0

	# SSE replacement for pfpnacc: negate the im*im lanes, then
	# interleave and add so that each lane holds a finished partial sum.
	xorps	%xmm1, %xmm6
	movaps	%xmm6, %xmm2
	unpcklps %xmm7, %xmm6
	unpckhps %xmm7, %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0x44, %xmm6, %xmm2	# b01000100
	shufps	$0xee, %xmm3, %xmm6	# b11101110
	addps	%xmm2, %xmm6
					# xmm6 = re im re im (two partial complex sums)
	movl	20(%ebp), %eax		# @result
	movhlps	%xmm6, %xmm4		# xmm4 low half = upper partial sum
	addps	%xmm4, %xmm6		# xmm6 low half = total re, total im
	movlps	%xmm6, (%eax)		# store low 2x32 bits (complex) to memory

	# No emms here: only SSE registers were used, no MMX state to clear.
	popl	%ebp
	ret

.Lfe1:
	.size	 ccomplex_dotprod_sse,.Lfe1-ccomplex_dotprod_sse
	.ident	"Hand coded x86 SSE assembly"

	# Tell the GNU linker this object does not need an executable stack.
	.section	.note.GNU-stack,"",@progbits

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -