📄 mem_transfer_ia64.s

📁 这是一个压缩解压包,用C语言进行编程的,里面有详细的源代码.
💻 S
📖 第 1 页 / 共 2 页
字号:
12 下一页
///////////////////////////////////////////////////////////////////////////////
//
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory
// "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
//
///// legal header taken from original C-file /////////////////////////////////
//
// XVID MPEG-4 VIDEO CODEC
// - 8bit<->16bit transfer  -
//
// This program is an implementation of a part of one or more MPEG-4
// Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
// to use this software module in hardware or software products are
// advised that its use may infringe existing patents or copyrights, and
// any such use would be at such party's own risk.  The original
// developer of this software module and his/her company, and subsequent
// editors and their companies, will have no liability for use of this
// software or modifications or derivatives thereof.
//
// This program is free software ; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation ; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY ; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program ; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
//
///// History /////////////////////////////////////////////////////////////////
//
// - 16.07.2002: several minor changes for ecc-conformity
// - 03.06.2002: initial version
//
///////////////////////////////////////////////////////////////////////////////
//
// Annotations:
// ===========
//
// - All functions work on 8x8-matrices.
//   While the C-code-functions treat each
//   element separately, the functions in this assembler-code treat a whole line
//   simultaneously. So one loop is saved.
//   The remaining loop is realized by using software pipelining with rotating
//   registers.
// - Register renaming is used for better readability.
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
//   parts are shifted and joined together with an "OR"-instruction.
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be
//   saved, as these GRs are used for register-rotation.
// - Some of the original, German comments used during development are left
//   in the code. They shouldn't bother anyone.
//
// Anmerkungen:
// ============
//
// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
//   jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
//   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
//   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
//   rotierenden Registern realisiert.
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
//   geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
//   logischen Oder zusammenkopiert.
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
//   sichert werden, da die Register für die Register-Rotation benötigt werden.
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
//   sind im Code verblieben.
//   [last sentence of the German annotations, translated:]
//   They should not bother anyone.
//
///////////////////////////////////////////////////////////////////////////////
//
//	*** Latencies used to size the software pipelines ***
//
//	Each value is the number of pipeline stages reserved for that kind of
//	operation; the procedures below use them to dimension their rotating
//	register arrays, predicate arrays and epilogue counters.

	LL    = 3	// load
	SL    = 3	// store
	PL    = 1	// pack
	SHL   = 1	// shift
	OL    = 1	// or
	UL    = 1	// unpack
	PAL   = 1	// parallel add
	PSL   = 1	// parallel subtract
	PAVGL = 1	// parallel average

	.text

///////////////////////////////////////////////////////////////////////////////
//
// transfer8x8_copy_ia64
//
// Copies eight rows of 8 bytes each from a source that need not be 8-byte
// aligned. For every row the two aligned 8-byte words that straddle the
// source address are loaded, shifted and OR-ed together, and the combined
// word is stored to the destination.
//
// Arguments, as read by the code below:
//	r32 = destination address
//	r33 = source address (any alignment)
//	r34 = stride, added to both the source and the destination per row
//
// Scratch registers written: r2, r3, r9, r14-r19, plus rotating registers.
// ar.lc and pr are saved on entry and restored before returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer8x8_copy_ia64#
	.proc transfer8x8_copy_ia64#

transfer8x8_copy_ia64:
	.prologue

//	*** register aliases ***

	zero    = r0
	oldLC   = r2		// caller's loop counter
	oldPR   = r3		// caller's predicate registers

	src_1   = r14		// aligned address at or below src
	src_2   = r15		// the aligned word following src_1
	dst     = r16		// destination address
	stride  = r17

	offset  = r18		// right-shift count in bits
	aoffset = r19		// left-shift count in bits (64 - offset)

//	*** keep the caller's LC and predicates ***

	.save ar.lc, oldLC
	mov oldLC = ar.lc
	mov oldPR = pr

	.body

//	*** new register frame: 3 inputs + 29 locals, all 32 rotating ***

	alloc r9 = ar.pfs, 3, 29, 0, 32

//	*** copy the arguments out of r32.., which is about to rotate ***

	mov dst = r32
	mov stride = r34

//	*** misalignment handling ***
//	offset = (src & 7) * 8, built by depositing the low three address bits
//	at bit position 3 of a zero word.
//	NOTE: for an already aligned src, offset is 0 and aoffset is 64; the
//	word at src_1 + 8 is still loaded in that case. The IA-64 definition
//	of shl gives 0 for a count above 63, so the OR leaves the first word
//	unchanged.

	and src_1 = -8, r33
	dep offset = r33, zero, 3, 3
	;;
	sub aoffset = 64, offset
	add src_2 = 8, src_1

//	*** loop setup: LC = 7 (eight rows), EC = number of pipeline stages,
//	    only p16 set among the rotating predicates ***

	mov ar.lc = 7
	mov ar.ec = LL + SHL + OL + 1
	mov pr.rot = 1 << 16
	;;

//	*** rotating register and predicate arrays for the pipeline ***
//	src_v1/src_v2 = loaded words, shd_r/shd_l = shifted right/left,
//	value = combined word

	.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]

//	Software-pipelined loop:
//	Stage 1: load one 8-byte word each from src_1 and src_2, then advance
//	         both addresses by stride
//	Stage 2: shift the first word right by offset, the second left by aoffset
//	Stage 3: OR the two shifted parts together
//	Stage 4: store the combined word and advance dst by stride

.Loop_8x8copy:
	{.mii
		(ld_stage[0]) ld8 src_v1[0] = [src_1], stride
		(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
	}
	{.mii
		(ld_stage[0]) ld8 src_v2[0] = [src_2], stride
		(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
		(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
	}
	{.mib
		(st_stage[0]) st8 [dst] = value[OL]
		(st_stage[0]) add dst = dst, stride
		br.ctop.sptk.few .Loop_8x8copy
		;;
	}

//	*** give the caller back its LC and predicates ***

	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0

	.endp transfer8x8_copy_ia64#

///////////////////////////////////////////////////////////////////////////////
//
// transfer_8to16copy_ia64
//
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two
// 4 x 16 bit values and stored to the destination. Destination is a continuous
// array of 64 x 16 bit signed data.
// To store the next line, only 16 must be
// added to the destination address.
//
// Arguments, as read by the code below:
//	r32 = destination address (written 16 bytes per row, contiguously)
//	r33 = source address
//	r34 = source stride
//
// Scratch registers written: r2, r3, r9, r14-r17, plus rotating registers.
// ar.lc and pr are saved on entry and restored before returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_8to16copy_ia64#
	.proc transfer_8to16copy_ia64#

transfer_8to16copy_ia64:
	.prologue

//	*** register aliases ***

	oldLC  = r2		// caller's loop counter
	oldPR  = r3		// caller's predicate registers
	zero   = r0		// stands for the constant 0

	dst_1  = r14		// where the first  4 x 16 bit values of a row go
	dst_2  = r15		// where the second 4 x 16 bit values of a row go
	src    = r16
	stride = r17

//	*** keep the caller's LC and predicates ***

	.save ar.lc, oldLC
	mov oldLC = ar.lc
	mov oldPR = pr

	.body

//	*** new register frame: 96 registers, all of them rotating ***

	alloc r9 = ar.pfs, 4, 92, 0, 96

//	*** copy the arguments out of r32.., which is about to rotate ***

	mov dst_1 = r32		// first half of each output row
	add dst_2 = 8, r32	// second half, 8 bytes further on
	mov src = r33
	mov stride = r34

//	*** loop setup: LC = 7 (eight rows), EC = number of pipeline stages,
//	    only p16 set among the rotating predicates ***

	mov ar.lc = 7
	mov ar.ec = LL + UL + 1
	mov pr.rot = 1 << 16
	;;

//	*** rotating register and predicate arrays for the pipeline ***
//	src_v = loaded source word, dst_v1/dst_v2 = the two unpacked halves

	.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
	.rotp ld_stage[LL], upack_stage[UL], st_stage[1]

//	Software-pipelined loop:
//	Stage 1: load 8 source bytes and advance src by stride
//	Stage 2: unpack them against zero into two words of 4 x 16 bit each
//	Stage 3: store both words; each destination pointer advances by 16

.Loop_8to16copy:
	{.mii
		(ld_stage[0]) ld8 src_v[0] = [src], stride
		(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
		(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
	}
	{.mmb
		(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
		(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
		br.ctop.sptk.few .Loop_8to16copy
		;;
	}

//	*** give the caller back its LC and predicates ***

	mov ar.lc = oldLC
	mov pr = oldPR, -1
	br.ret.sptk.many b0

	.endp transfer_8to16copy_ia64#

///////////////////////////////////////////////////////////////////////////////
//
// transfer_16to8copy_ia64
//
// src is a 64 x 16 bit signed continuous array.
// To convert the 16 bit
// values to 8 bit unsigned data, PACK is used. So two 8-byte words of
// 4 x 16 bit signed data are loaded, packed together and stored as one
// 8-byte word of 8 x 8 bit unsigned data to the destination.
//
// Arguments, as read by the code below:
//	r32 = destination address
//	r33 = source address (read 16 bytes per row, contiguously)
//	r34 = destination stride
//
// Scratch registers written: r2, r3, r9, r14-r17, plus rotating registers.
// ar.lc and pr are saved on entry and restored before returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_16to8copy_ia64#
	.proc transfer_16to8copy_ia64#

transfer_16to8copy_ia64:
	.prologue

//	*** register aliases ***
//	oldLC and oldPR are defined here as well, the same way the other
//	procedures in this file do, so that this procedure does not depend on
//	aliases left over from a procedure assembled earlier.

	oldLC  = r2		// caller's loop counter
	oldPR  = r3		// caller's predicate registers

	dst    = r14
	src_1  = r15		// first  4 x 16 bit values of a source row
	src_2  = r17		// second 4 x 16 bit values of a source row
	stride = r16

//	*** keep the caller's LC and predicates ***

	.save ar.lc, oldLC
	mov oldLC = ar.lc
	mov oldPR = pr

	.body

//	*** new register frame: 96 registers, all of them rotating ***

	alloc r9 = ar.pfs, 4, 92, 0, 96

//	*** copy the arguments out of r32.., which is about to rotate ***

	mov dst = r32
	mov src_1 = r33
	add src_2 = 8, r33	// second half of the row, 8 bytes further on
	mov stride = r34

//	*** loop setup: LC = 7 (eight rows), EC = number of pipeline stages,
//	    only p16 set among the rotating predicates ***

	mov ar.lc = 7
	mov ar.ec = LL + PL + 1
	mov pr.rot = 1 << 16
	;;

//	*** rotating register and predicate arrays for the pipeline ***
//	src_v1/src_v2 = the two loaded source words, dst_v = packed result

	.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
	.rotp ld_stage[LL], pack_stage[PL], st_stage[1]

//	Software-pipelined loop:
//	Stage 1: load two 8-byte words of 4 x 16 bit signed source data; each
//	         source pointer advances by 16
//	Stage 2: pack them (pack2.uss) into one 8-byte word of 8 x 8 bit
//	         unsigned data
//	Stage 3: store that word and add stride to the destination address

.Loop_16to8copy:
	{.mmi
		(ld_stage[0]) ld8 src_v1[0] = [src_1], 16
		(ld_stage[0]) ld8 src_v2[0] = [src_2], 16
		(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
	}
	{.mib
		(st_stage[0]) st8 [dst] = dst_v[PL]
		(st_stage[0]) add dst = dst, stride
		br.ctop.sptk.few .Loop_16to8copy
		;;
	}

//	*** give the caller back its LC and predicates ***

	mov ar.lc = oldLC
	mov pr = oldPR, -1
	br.ret.sptk.many b0

	.endp transfer_16to8copy_ia64#

///////////////////////////////////////////////////////////////////////////////
//
// transfer_16to8add_ia64
//
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -