⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trellisupdatemmx.cpp

📁 Dream.exe soft source (Visual C++)
💻 CPP
字号:
/******************************************************************************\
 * Technische Universitaet Darmstadt, Institut fuer Nachrichtentechnik
 * Copyright (c) 2003
 *
 * Author(s):
 *	Volker Fischer, Phil Karn, Morgan Lius
 *
 * Description:
 *
	MMX fixed-point implementation of trellis update

	This code is based on a Viterbi sample code from
	Phil Karn, KA9Q (Dec 2001)
	simd-viterbi-2.0.3.zip -> viterbi27.c, mmxbfly27.s, ssebfly27.s
	homepage: http://www.ka9q.net

	Some comments to this MMX code:
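	- To compare two unsigned 8-bit values (MMX only offers pcmpeqb and the
	  signed pcmpgtb), we need to apply a special strategy:
	  psubusb mm5, mm1 // mm5 = mm5 - mm1, saturated at 0
	  pcmpeqb mm5, mm3 // mm3 = 0
	  We subtract unsigned with saturation and afterwards compare for
	  equal to zero. If the value in mm1 is larger than or equal to the
	  value in mm5, the subtraction always gives 0, so the compare sets
	  that byte to 0xFF (otherwise it is set to 0x00)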
	- Defining __asm Blocks as C Macros in windows: Put the __asm keyword in
	  front of each assembly instruction
	- If we want to use c-pointers to arrays (like "pOldTrelMetric"), we
	  first have to copy it to a register (like edx), otherwise we get
	  errors
 *
 ******************************************************************************
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
\******************************************************************************/

#include "ViterbiDecoder.h"


/* Implementation *************************************************************/
#ifdef USE_MMX
void CViterbiDecoder::TrellisUpdateMMX(const _DECISIONTYPE* pCurDec,
		const _VITMETRTYPE* pCurTrelMetric, const _VITMETRTYPE* pOldTrelMetric,
		const _VITMETRTYPE* pchMet1, const _VITMETRTYPE* pchMet2)
{
#ifdef _WIN32
	/**************************************************************************\
	* Windows                                                                  *
	\**************************************************************************/
	__asm
	{
		/* Each invocation of BFLY() will do 8 butterflies in parallel */
#		define BFLY(GROUP) \
		{ \
			/* Compute branch metrics */ \
			__asm mov edx, pOldTrelMetric /* Incoming path metric */ \
			__asm movq mm4, [edx + (8 * GROUP)] /* high bit = 0 */ \
			__asm movq mm5, [edx + ((8 * GROUP) + 32)] /* high bit = 1 */ \
			__asm mov edx, pchMet1 \
			__asm movq mm0, [edx + (8 * GROUP)] \
			__asm mov edx, pchMet2 \
			__asm movq mm3, [edx + (8 * GROUP)] \
			\
			__asm movq mm1, mm4 /* first set (mm1, mm2) */ \
			__asm paddusb mm1, mm0 /* first set: decision for bit = 0 (mm1) */ \
			__asm movq mm2, mm5 \
			__asm paddusb mm2, mm3 /* first set: decision for bit = 1 (mm2) */ \
			__asm movq mm6, mm4 /* second set (mm6, mm7) */ \
			__asm paddusb mm6, mm3 /* second set: decision for bit = 0 (mm6) */ \
			__asm movq mm7, mm5 \
			__asm paddusb mm7, mm0 /* second set: decision for bit = 1 (mm7) */ \
			\
			/* live registers 1 2 6 7. Compare mm1 and mm2; mm6 and mm7 */ \
			__asm movq mm5, mm2 \
			__asm movq mm4, mm7 \
			__asm psubusb mm5, mm1 /* mm5 = mm2 - mm1 */ \
			__asm psubusb mm4, mm6 /* mm4 = mm7 - mm6 */ \
			__asm pxor mm3, mm3 /* zero mm3 register, needed for comparison */ \
			__asm pcmpeqb mm5, mm3 /* mm5 = first set of decisions */ \
			__asm pcmpeqb mm4, mm3 /* mm4 = second set of decisions */ \
			\
			/* live registers 1 2 4 5 6 7. Select survivors. Avoid jumps
			   -> mask results with AND and ANDN. then OR */ \
			__asm movq mm3, mm5 \
			__asm movq mm0, mm4 \
			__asm pand mm2, mm5 \
			__asm pand mm7, mm4 \
			__asm pandn mm3, mm1 \
			__asm pandn mm0, mm6 \
			__asm por mm2, mm3 /* mm2: first set survivors (decisions in mm5) */ \
			__asm por mm7, mm0 /* mm7: second set survivors (decisions in mm4) */ \
			\
			/* live registers 2 4 5 7 */ \
			/* interleave & store decisions in mm4, mm5 */ \
			/* interleave & store new branch metrics in mm2, mm7 */ \
			__asm movq mm3, mm5 \
			__asm movq mm0, mm2 \
			__asm punpcklbw mm3, mm4 /* interleave first 8 decisions */ \
			__asm punpckhbw mm5, mm4 /* interleave second 8 decisions */ \
			__asm punpcklbw mm0, mm7 /* interleave first 8 new metrics */ \
			__asm punpckhbw mm2, mm7 /* interleave second 8 new metrics */ \
			__asm mov edx, pCurDec \
			__asm movq [edx + (16 * GROUP)], mm3 \
			__asm movq [edx + (16 * GROUP + 8)], mm5 \
			__asm mov edx, pCurTrelMetric \
			__asm movq [edx + (16 * GROUP)], mm0 /* new metrics */ \
			__asm movq [edx + (16 * GROUP + 8)], mm2 \
		}

		/* Invoke macro 4 times for a total of 32 butterflies */
		BFLY(0)
		BFLY(1)
		BFLY(2)
		BFLY(3)


		/* -----------------------------------------------------------------
		   Normalize by finding smallest metric and subtracting it
		   from all metrics */

#if 1 // if 0, always normalize
		/* See if we have to normalize */
		mov eax, [edx] /* Extract first output metric */
		and eax, 255
		cmp eax, 150 /* Is it greater than 150? */
		mov eax, 0
		jle done /* No, no need to normalize */
#endif

		/* Search for the minimum metric. Result ist stored in mm0 */
#		define PMINUB_MM0_MM1 \
		{ \
			__asm movq mm2, mm0 \
			__asm psubusb mm2, mm1 /* mm2 = mm0 - mm1 */ \
			__asm pxor mm3, mm3 /* zero mm3 register, needed for comparison */ \
			__asm pcmpeqb mm2, mm3 /* decisions */ \
			\
			__asm pand mm0, mm2 \
			__asm pandn mm2, mm1 \
			__asm por mm0, mm2 \
		}

		/* Search for minimum, byte-wise for whole register */
		movq mm0, [edx]
		movq mm1, [edx + 8]
		PMINUB_MM0_MM1
		movq mm1, [edx + 16]
		PMINUB_MM0_MM1
		movq mm1, [edx + 24]
		PMINUB_MM0_MM1
		movq mm1, [edx + 32]
		PMINUB_MM0_MM1
		movq mm1, [edx + 40]
		PMINUB_MM0_MM1
		movq mm1, [edx + 48]
		PMINUB_MM0_MM1
		movq mm1, [edx + 56]
		PMINUB_MM0_MM1

		/* mm0 contains 8 smallest metrics
		   crunch down to single lowest metric */
		movq mm1, mm0
		psrlq mm0, 32 /* Compare lowest 4 bytes with highest 4 bytes */
		PMINUB_MM0_MM1 /* -> results are in lowest 4 bytes */
		movq mm1, mm0
		psrlq mm0, 16 /* Compare lowest 2 bytes with mext 2 bytes */
		PMINUB_MM0_MM1 /* -> results are in lowest 2 bytes */
		movq mm1, mm0
		psrlq mm0, 8 /* Compare lowest byte with second lowest byte */
		PMINUB_MM0_MM1 /* -> resulting minium metric is in lowest byte */

		/* Expand value in lowest byte to all 8 bytes */
		punpcklbw mm0,mm0 /* First 2 bytes have same value */
		punpcklbw mm0,mm0 /* First 4 bytes have same value */
		punpcklbw mm0,mm0 /* All bytes are the same */


		/* mm0 now contains lowest metric in all 8 bytes
		   subtract it from every output metric. Trashes mm7 */
#		define PSUBUSBM(MEM, REG) \
		{ \
			__asm movq mm7, MEM \
			__asm psubusb mm7, REG \
			__asm movq MEM, mm7 \
		}

		PSUBUSBM([edx], mm0)
		PSUBUSBM([edx + 8], mm0)
		PSUBUSBM([edx + 16], mm0)
		PSUBUSBM([edx + 24], mm0)
		PSUBUSBM([edx + 32], mm0)
		PSUBUSBM([edx + 40], mm0)
		PSUBUSBM([edx + 48], mm0)
		PSUBUSBM([edx + 56], mm0)


	done:
		/* Needed, when we have used mmx registers and want to use floating
		   point operations afterwards */
		emms

#undef BFLY
#undef MINIMUM
#undef PSUBUSBM
	}
#else
	/**************************************************************************\
	* Linux                                                                    *
	\**************************************************************************/
	/* Each invocation of BFLY() will do 8 butterflies in parallel */
#	define BFLY(GROUP) \
		/* Compute branch metrics */ \
		"mov %1,%%edx; " /* Incoming path metric (input) */ \
		"movq (8 * "GROUP")(%%edx),%%mm4; " /* high bit = 0 */ \
		"movq ((8 * "GROUP")+32)(%%edx),%%mm5; "	/* high bit = 1  */ \
		"mov %2,%%eax; "  \
		"mov %3,%%ebx ;"   \
		"movq (8 * "GROUP")(%%eax),%%mm0; "  \
		"movq (8 * "GROUP")(%%ebx),%%mm3; "  \
		\
		"movq %%mm4,%%mm1; " /* first set (mm1, mm2) */ \
		"paddusb %%mm0,%%mm1; " /* first set: decision for bit = 0 (mm1) */ \
		"movq %%mm5,%%mm2; " \
		"paddusb %%mm3,%%mm2; " /* first set: decision for bit = 1 (mm2) */ \
		"movq %%mm4,%%mm6;" /* second set (mm6, mm7) */ \
		"paddusb %%mm3,%%mm6; " /* second set: decision for bit = 0 (mm6) */ \
		"movq %%mm5,%%mm7; " \
		"paddusb %%mm0,%%mm7; " /* second set: decision for bit = 1 (mm7) */ \
		\
		/* live registers 1 2 6 7. Compare mm1 and mm2; mm6 and mm7 */ \
		"movq %%mm2,%%mm5; " \
		"movq %%mm7,%%mm4; " \
		"psubusb %%mm1,%%mm5; " /* mm5 = mm2 - mm1 */ \
		"psubusb %%mm6,%%mm4; " /* mm4 = mm7 - mm6 */ \
		"pxor %%mm3,%%mm3; " /* zero mm3 register, needed for comparison */ \
		"pcmpeqb %%mm3,%%mm5; " /* mm5 = first set of decisions */ \
		"pcmpeqb %%mm3,%%mm4; " /* mm4 = second set of decisions */ \
		\
		/* live registers 1 2 4 5 6 7. Select survivors. Avoid jumps */ \
		/*   -> mask results with AND and ANDN. then OR */ \
		"movq %%mm5,%%mm3; " \
		"movq %%mm4,%%mm0; " \
		"pand %%mm5,%%mm2; " \
		"pand %%mm4,%%mm7; " \
		"pandn %%mm1,%%mm3; " \
		"pandn %%mm6,%%mm0; " \
		"por %%mm3,%%mm2; " /* mm2: first set survivors (decisions in mm5) */ \
		"por %%mm0,%%mm7; " /* mm7: second set survivors (decisions in mm4) */ \
		\
		/* live registers 2 4 5 7 */ \
		/* interleave & store decisions in mm4, mm5 */ \
		/* interleave & store new branch metrics in mm2, mm7 */ \
		"movq %%mm5,%%mm3; " \
		"movq %%mm2,%%mm0; " \
		"punpcklbw %%mm4,%%mm3; " /* interleave first 8 decisions */ \
		"punpckhbw %%mm4,%%mm5; " /* interleave second 8 decisions */ \
		"punpcklbw %%mm7,%%mm0; " /* interleave first 8 new metrics */ \
		"punpckhbw %%mm7,%%mm2; " /* interleave second 8 new metrics */ \
		"mov %4,%%edx; " \
		"movq %%mm3,(16 * "GROUP")(%%edx); "  \
		"movq %%mm5,((16 * "GROUP") + 8)(%%edx); "   \
		"mov %0,%%edx; " \
		"movq %%mm0,(16 * "GROUP")(%%edx); " /* new metrics */ \
		"movq %%mm2,((16 * "GROUP") + 8)(%%edx); " \


	asm
	(
		/* Invoke macro 4 times for a total of 32 butterflies */
		BFLY("0")
		BFLY("1")
		BFLY("2")
		BFLY("3")


		/* -----------------------------------------------------------------
		   Normalize by finding smallest metric and subtracting it
		   from all metrics */

#if 1 // if 0, always normalize
		/* See if we have to normalize */
		
		"mov (%%edx),%%eax; " /* Extract first output metric */
		"and $255,%%eax; "
		"cmp $150,%%eax; " /* Is it greater than 150? */
		"mov $0,%%eax ;"
		"jle done; " /* No, no need to normalize.  */

#endif

		/* Search for the minimum metric. Result ist stored in mm0 */
#		define PMINUBMM0MM1 \
			"movq %%mm0,%%mm2; " \
			"psubusb %%mm1,%%mm2; " /* mm2 = mm0 - mm1 */ \
			"pxor %%mm3,%%mm3; " /* zero mm3 register, needed for comparison */ \
			"pcmpeqb %%mm3,%%mm2; " /* decisions */ \
			"pand %%mm2,%%mm0; " \
			"pandn %%mm1,%%mm2; " \
			"por %%mm2,%%mm0; "


		/* Search for minimum, byte-wise for whole register */
		"movq (%%edx),%%mm0; "
		"movq 8(%%edx),%%mm1; "
		PMINUBMM0MM1
		"movq 16(%%edx),%%mm1; "
		PMINUBMM0MM1
		"movq 24(%%edx),%%mm1;"
		PMINUBMM0MM1
		"movq 32(%%edx),%%mm1; "
		PMINUBMM0MM1
		"movq 40(%%edx),%%mm1; "
		PMINUBMM0MM1
		"movq 48(%%edx),%%mm1; "
		PMINUBMM0MM1
		"movq 56(%%edx),%%mm1; "
		PMINUBMM0MM1

		/* mm0 contains 8 smallest metrics
		   crunch down to single lowest metric */

		"movq %%mm0,%%mm1; "
		"psrlq $32,%%mm0; " /* Compare lowest 4 bytes with highest 4 bytes */
		PMINUBMM0MM1 /* -> results are in lowest 4 bytes */
		"movq %%mm0,%%mm1; "
		"psrlq $16,%%mm0; " /* Compare lowest 2 bytes with mext 2 bytes */
		PMINUBMM0MM1 /* -> results are in lowest 2 bytes */
		"movq %%mm0,%%mm1; "
		"psrlq $8,%%mm0; " /* Compare lowest byte with second lowest byte */
		PMINUBMM0MM1 /* -> resulting minium metric is in lowest byte */

		/* Expand value in lowest byte to all 8 bytes */
		"punpcklbw %%mm0,%%mm0; " /* First 2 bytes have same value */
		"punpcklbw %%mm0,%%mm0; " /* First 4 bytes have same value */
		"punpcklbw %%mm0,%%mm0; " /* All bytes are the same */


		/* mm0 now contains lowest metric in all 8 bytes
		   subtract it from every output metric. Trashes mm7 */
#		define PSUBUSBM(REG, MEM) \
			"movq "MEM",%%mm7; " \
			"psubusb "REG",%%mm7; " \
			"movq %%mm7,"MEM"; " \


		PSUBUSBM("%%mm0","(%%edx)")
		PSUBUSBM("%%mm0","8(%%edx)")
		PSUBUSBM("%%mm0","16(%%edx)")
		PSUBUSBM("%%mm0","24(%%edx)")
		PSUBUSBM("%%mm0","32(%%edx)")
		PSUBUSBM("%%mm0","40(%%edx)")
		PSUBUSBM("%%mm0","48(%%edx)")
		PSUBUSBM("%%mm0","56(%%edx)")


		"done: emms; "	/* Needed, when we have used mmx registers and want to use floating
			 point operations afterwards */

		:
		:"m"(pCurTrelMetric),"m"(pOldTrelMetric),"m"(chMet1),"m"(chMet2),"m"(pCurDec)
	);

#undef BFLY
#undef MINIMUM
#undef PSUBUSBM
#endif
}
#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -