📄 mem_transfer_ia64.s

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 S
📖 第 1 页 / 共 2 页
字号:
12 下一页
 ///////////////////////////////////////////////////////////////////////////////
 //
 // mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
 // University of Karlsruhe, Germany, 03.06.2002, during the laboratory
 // "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
 //
 //
 ///// legal header taken from original C-file ///////////////////////////////////////
 //
 // XVID MPEG-4 VIDEO CODEC
 // - 8bit<->16bit transfer  -
 //
 // This program is an implementation of a part of one or more MPEG-4
 // Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
 // to use this software module in hardware or software products are
 // advised that its use may infringe existing patents or copyrights, and
 // any such use would be at such party's own risk.  The original
 // developer of this software module and his/her company, and subsequent
 // editors and their companies, will have no liability for use of this
 // software or modifications or derivatives thereof.
 //
 // This program is free software ; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation ; either version 2 of the License, or
 // (at your option) any later version.
 //
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY ; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 // You should have received a copy of the GNU General Public License
 // along with this program ; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 //
 ///// History /////////////////////////////////////////////////////////////////
 //
 // - 16.07.2002: several minor changes for ecc-conformity
 // - 03.06.2002: initial version
 //
 ///////////////////////////////////////////////////////////////////////////////
 //
 // Annotations:
 // ===========
 //
 // - All functions work on 8x8-matrices. While the C-code-functions treat each
 //   element separately, the functions in this assembler-code treat a whole line
 //   simultaneously. So one loop is saved.
 //   The remaining loop is realized by using software pipelining with rotating
 //   registers.
 // - Register renaming is used for better readability.
 // - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
 //   parts are shifted and joined together with an "OR"-Instruction.
 // - First parameter is stored in GR 32, next in GR 33, and so on. They must be
 //   saved, as these GRs are used for register-rotation.
 // - Some of the original, German comments used during development are left
 //   in the code. They shouldn't bother anyone.
 //
 // Anmerkungen:
 // ============
 //
 // - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
 //   jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
 //   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
 //   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
 //   rotierenden Registern realisiert.
 // - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
 // - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
 //   geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
 //   logischen Oder zusammenkopiert.
 // - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
 //   sichert werden, da die Register für die Register-Rotation benötigt werden.
 // - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
 //   sind im Code verblieben. Sie sollten niemanden stören.
 //
 ///////////////////////////////////////////////////////////////////////////////
 
 
 //      ***     define latencies for software pipelines ***
 //
 //      Each value is a number of pipeline stages. The procedures below use
 //      them to size their .rotr/.rotp arrays, to index the rotating
 //      registers, and to compute the epilogue count (ar.ec).
 
         LL  = 3 // Load
         SL  = 3 // Store
         PL  = 1 // Pack
         SHL = 1 // Shift 
         OL  = 1 // Or
         UL  = 1 // Unpack
         PAL = 1 // Parallel Add
         PSL = 1 // Parallel Subtract
         PAVGL = 1 // Parallel Average
 
         .text   
         
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer8x8_copy_ia64
 //
 // SRC may be misaligned. To align it, two 8-byte words are loaded, shifted and
 // joined, and the aligned result is stored to the destination address.
 //
 ///////////////////////////////////////////////////////////////////////////////
 
         .align 16
         .global transfer8x8_copy_ia64#
         .proc transfer8x8_copy_ia64#

 // Copies an 8x8 block of bytes, one 8-byte row per loop iteration.
 //
 // Inputs (as read by the code below):
 //   r32 = destination address, advanced by stride after each row
 //   r33 = source address, any byte alignment
 //   r34 = stride, added to the source and destination addresses per row
 //
 // Registers used: r2 (saved ar.lc), r3 (saved pr), r9 (ar.pfs copy written
 // by alloc), r14-r19, the rotating registers from r32 up, the rotating
 // predicates from p16 up, ar.lc and ar.ec. ar.lc and pr are restored before
 // the return.
 //
 // NOTE(review): every row loads both aligned 8-byte words around the source
 // row, so up to 7 bytes before and up to 8 bytes after the row are read
 // (the full 8 bytes after when src is already aligned) -- confirm that the
 // callers' buffers allow this.
 // NOTE(review): rows are written with a plain st8, so dst is presumably
 // 8-byte aligned -- verify against the callers.

 transfer8x8_copy_ia64:
         .prologue

 //      *** register renaming ***
         zero = r0

         oldLC = r2
         oldPR = r3

         src_1 = r14 // address of the aligned 8-byte word containing the first source byte
         src_2 = r15 // address of the next aligned 8-byte word
         dst = r16  // destination address
         stride = r17

         offset = r18 // shift right count in bits
         aoffset = r19 // shift left count in bits (64 - offset)

 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
 //      Both saves are described to the unwinder: the loop below rewrites
 //      ar.lc and the rotating predicates, so their old values live only in
 //      oldLC / oldPR until the epilogue restores them.
         .save ar.lc, oldLC
         mov oldLC = ar.lc
         .save pr, oldPR
         mov oldPR = pr

         .body

 //      *** Allocating new stackframe with 32 rotating registers ***
         alloc r9 = ar.pfs, 3, 29, 0, 32

 //      *** Saving Parameters ***
 //      r32.. become rotating registers, so the arguments are copied out first.
         mov dst = r32
         mov stride = r34

 //      *** Misalignment-Treatment ***
         and src_1 = -8, r33 // Address of the first aligned block containing src values
         dep offset = r33, zero, 3, 3 // offset = (src & 7) * 8, the shr count in bits
         ;;
         sub aoffset = 64, offset // Counterpart of offset ("anti-offset"), used for shl
         add src_2 = 8, src_1 // Address of the second aligned block containing src values

 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // 8 rows
         mov ar.ec = LL + SHL + OL + 1 // one count per stage predicate declared in .rotp below
         mov pr.rot = 1 << 16 // only p16 (ld_stage[0]) set on entry
         ;;

 //      *** define register arrays and predicate array for software pipeline ***
         // src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left
         .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
         .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]


 //      Software pipelined loop:
 //      Stage 1: Load two 8-byte words from SRC_1 and SRC_2 into SRC_V1 and SRC_V2
 //      Stage 2: Shift both source values into SHD_R and SHD_L
 //      Stage 3: Join both parts together with OR
 //      Stage 4: Store the aligned data to the destination and add stride to
 //               the destination address


 .Loop_8x8copy:
         {.mii
                 (ld_stage[0]) ld8 src_v1[0] = [src_1], stride
                 (sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
         }
         {.mii
                 (ld_stage[0]) ld8 src_v2[0] = [src_2], stride
                 (sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
                 (or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
         }
         {.mib
                 (st_stage[0]) st8 [dst] = value[OL]
                 (st_stage[0]) add dst = dst, stride // advance dst to the next row
                 br.ctop.sptk.few .Loop_8x8copy
                 ;;
         }

 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1

         br.ret.sptk.many b0

         .endp transfer8x8_copy_ia64#
 
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer_8to16copy_ia64
 //
 // SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
 // UNPACK is used. So 8 bytes are loaded from source, unpacked to two 
 // 4 x 16 bit values and stored to the destination. Destination is a continuous 
 // array of 64 x 16 bit signed data. To store the next line, only 16 must be
 // added to the destination address.
 ///////////////////////////////////////////////////////////////////////////////
 
         .align 16
         .global transfer_8to16copy_ia64#
         .proc transfer_8to16copy_ia64#

 // Widens an 8x8 block of bytes to 16-bit values, one row per loop iteration.
 //
 // Inputs (as read by the code below):
 //   r32 = destination address; each row writes 16 bytes and the destination
 //         advances by 16 per row
 //   r33 = source address; each row is read with a single ld8
 //   r34 = stride, added to the source address per row
 //
 // Registers used: r2 (saved ar.lc), r3 (saved pr), r9 (ar.pfs copy written
 // by alloc), r14-r17, the rotating registers from r32 up, the rotating
 // predicates from p16 up, ar.lc and ar.ec. ar.lc and pr are restored before
 // the return.
 //
 // NOTE(review): the ld8/st8 accesses presumably require src, stride and dst
 // to be 8-byte aligned -- verify against the callers.

 transfer_8to16copy_ia64:
         .prologue

 //      *** register renaming ***
         oldLC = r2
         oldPR = r3

         zero = r0 // the constant 0; interleaved as the upper byte of every 16-bit result

         dst_1 = r14 // destination address for first 4 x 16 bit values
         dst_2 = r15 // destination address for second 4 x 16 bit values
         src = r16
         stride = r17

 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
 //      Both saves are described to the unwinder: the loop below rewrites
 //      ar.lc and the rotating predicates, so their old values live only in
 //      oldLC / oldPR until the epilogue restores them.
         .save ar.lc, oldLC
         mov oldLC = ar.lc
         .save pr, oldPR
         mov oldPR = pr


         .body

 //      *** Allocating new stackframe, define rotating registers ***
         alloc r9 = ar.pfs, 4, 92, 0, 96

 //      *** Saving Parameters ***
 //      r32.. become rotating registers, so the arguments are copied out first.
         mov dst_1 = r32 // first 4 x 16 bit values
         add dst_2 = 8, r32 // second 4 x 16 bit values
         mov src = r33
         mov stride = r34

 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // 8 rows
         mov ar.ec = LL + UL + 1 // one count per stage predicate declared in .rotp below
         mov pr.rot = 1 << 16 // only p16 (ld_stage[0]) set on entry
         ;;

 //      *** define register arrays and predicate array for software pipeline ***
         // src_v = source value, dst_v1 = destination value 1
         .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
         .rotp ld_stage[LL], upack_stage[UL], st_stage[1]


 //      Software pipelined loop:
 //      Stage 1: Load value of SRC
 //      Stage 2: Unpack SRC_V into two words of 4 x 16 bit data each
 //      Stage 3: Store both 8-byte words of 16 bit data


 .Loop_8to16copy:
         {.mii
                 (ld_stage[0]) ld8 src_v[0] = [src], stride
                 (upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
                 (upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
         }
         {.mmb
                 (st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
                 (st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
                 br.ctop.sptk.few .Loop_8to16copy
                 ;;
         }

 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1

         br.ret.sptk.many b0
         .endp transfer_8to16copy_ia64#
 
 
         
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer_16to8copy_ia64
 //
 // src is a 64 x 16 bit signed continuous array. To convert the 16 bit
 // values to 8 bit unsigned data, PACK is used. So two 8-byte-words of
 // 4 x 16 bit signed data are loaded, packed together and stored as one
 // 8-byte-word of 8 x 8 bit unsigned data to the destination.
 ///////////////////////////////////////////////////////////////////////////////
 
         .align 16
         .global transfer_16to8copy_ia64#
         .proc transfer_16to8copy_ia64#

 // Narrows 64 contiguous 16-bit values to an 8x8 block of bytes, one row per
 // loop iteration, using pack2.uss.
 //
 // Inputs (as read by the code below):
 //   r32 = destination address, advanced by stride after each row
 //   r33 = source address; each row reads 16 bytes with two ld8 and the
 //         source advances by 16 per row
 //   r34 = stride, added to the destination address per row
 //
 // Registers used: r2 (saved ar.lc), r3 (saved pr), r9 (ar.pfs copy written
 // by alloc), r14-r17, the rotating registers from r32 up, the rotating
 // predicates from p16 up, ar.lc and ar.ec. ar.lc and pr are restored before
 // the return.
 //
 // NOTE(review): the ld8/st8 accesses presumably require src, dst and stride
 // to be 8-byte aligned -- verify against the callers.

 transfer_16to8copy_ia64:
         .prologue

 //      *** register renaming ***
 //      oldLC / oldPR are defined here as well, so this procedure does not
 //      depend on aliases left over from the procedures above it.
         oldLC = r2
         oldPR = r3

         dst = r14
         src_1 = r15 // address of the first 4 x 16 bit values of a row
         src_2 = r17 // address of the second 4 x 16 bit values of a row
         stride = r16

 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
 //      Both saves are described to the unwinder: the loop below rewrites
 //      ar.lc and the rotating predicates, so their old values live only in
 //      oldLC / oldPR until the epilogue restores them.
         .save ar.lc, oldLC
         mov oldLC = ar.lc
         .save pr, oldPR
         mov oldPR = pr


         .body

 //      *** Allocating new stackframe, define rotating registers ***
         alloc r9 = ar.pfs, 4, 92, 0, 96

 //      *** Saving Parameters ***
 //      r32.. become rotating registers, so the arguments are copied out first.
         mov dst = r32
         mov src_1 = r33
         add src_2 = 8, r33
         mov stride = r34

 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // 8 rows
         mov ar.ec = LL + PL + 1 // one count per stage predicate declared in .rotp below
         mov pr.rot = 1 << 16 // only p16 (ld_stage[0]) set on entry
         ;;

 //      *** define register arrays and predicate array for software pipeline ***
         // src_v1 = source value 1, dst_v = destination value
         .rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
         .rotp ld_stage[LL], pack_stage[PL], st_stage[1]


 //      Software pipelined loop:
 //      Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
 //      Stage 2: Pack them together into one 8-byte word of 8 x 8 bit unsigned data
 //      Stage 3: Store the 8 bytes to the destination address and add stride to
 //               the destination address (to get the next 8 byte line of destination)


 .Loop_16to8copy:
         {.mmi
                 (ld_stage[0]) ld8 src_v1[0] = [src_1], 16
                 (ld_stage[0]) ld8 src_v2[0] = [src_2], 16
                 (pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
         }
         {.mib
                 (st_stage[0]) st8 [dst] = dst_v[PL]
                 (st_stage[0]) add dst = dst, stride // advance dst to the next row
                 br.ctop.sptk.few .Loop_16to8copy
                 ;;
         }

 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1

         br.ret.sptk.many b0
         .endp transfer_16to8copy_ia64#
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer_16to8add_ia64
 //
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -