📄 mem_transfer_ia64.s

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 S
📖 第 1 页 / 共 2 页
字号:
上一页 12
 // The 8-bit values of dst are "unpacked" into two 8-byte blocks containing
 // 16-bit values. These are "parallel-added" to the values of src. The result
 // is converted into 8-bit values using "PACK" and stored at the address of dst.
 // We assume that there is no misalignment.
 //
 ///////////////////////////////////////////////////////////////////////////////
 
         .align 16
         .global transfer_16to8add_ia64#
         .proc transfer_16to8add_ia64#
 
 // transfer_16to8add_ia64
 //   In:  r32 = dst, r33 = src, r34 = stride (copied to r14-r16 below).
 //   Eight loop iterations (ar.lc = 7). Each iteration loads 8 bytes from dst
 //   and 2 x 8 bytes from src, widens the dst bytes to 16 bit, adds src with
 //   padd2.sss, packs the sums with pack2.uss and stores 8 bytes back to the
 //   dst address the row was loaded from. src advances by 16 bytes per
 //   iteration, dst by stride.
 //   LL, UL, PAL and PL are not defined in this part of the file; presumably
 //   they are the per-stage latencies of the pipeline -- confirm.
 transfer_16to8add_ia64:
         .prologue
 
 //      *** register renaming ***
 //      NOTE(review): oldLC and oldPR are used below but are not aliased in
 //      this procedure; the other procedures in this file set oldLC = r2 and
 //      oldPR = r3, and ".save ar.lc, r2" below matches that -- confirm that a
 //      definition precedes this point in the file.
         dst = r14 
         src = r15
         stride = r16
         
         _src = r17 // address of the second 8-byte half of the current src row
 
 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
         .save ar.lc, r2
         mov oldLC = ar.lc
         mov oldPR = pr
 
 
         .body
 
 //      *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
 //      Frame of 96 registers, all 96 rotating. The previous ar.pfs goes to r9
 //      and is not read again in this procedure.
 //      NOTE(review): 4 input registers are declared although only r32-r34 are
 //      read here; alloc is also not preceded by an explicit stop -- presumably
 //      the assembler inserts one, confirm.
         alloc r9 = ar.pfs, 4, 92, 0, 96
         
 //      *** Saving Parameters ***
 //      The arguments are copied into static registers because r32 and up are
 //      part of the rotating region used by the loop.
         mov dst = r32
         mov src = r33
         mov stride = r34
         add _src = 8, r33 // _src = src + 8
 
 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // LC = 7 -> 8 loop iterations
         mov ar.ec = LL + UL + PAL + PL + 1 // same sum as the .rotp stage list below
         mov pr.rot = 1 << 16 // set p16 only, clear the other rotating predicates
         ;;
 
 //      *** define register arrays and predicate array for software pipeline ***
         .rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
         .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
         
         
 //      Software pipelined loop:
 //      s1_p: The values of src and dst are loaded
 //      s2_p: The dst-values are converted to 16-bit-values
 //      s3_p: The values of src and dst are added
 //      s4_p: The results are packed into 8-bit-values
 //      s5_p: The 8-bit-values are stored at the dst-addresses
 //      The four bundles below carry no stop until the ";;" after the branch.
 
 
 .Loop_16to8add:
         {.mii   
                 (s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the 1st half of row j of src (i = 0..3); src += 16
                 (s1_p[0]) mov _dst[0] = dst // remember the address of this dst row for the store in stage s5_p
                 (s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel addition of src and dst
         }
         {.mii
                 (s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst; dst += stride
                 (s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16-bit for i = 0..3 (interleaved with r0)
                 (s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16-bit for i = 4..7 (interleaved with r0)
         }
         {.mii
                 (s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the 2nd half of row j of src (i = 4..7); _src += 16
                 (s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel addition of src and dst
                 (s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) into 8-bit values; the range check (saturation) is done by the instruction
         }
         {.mmb
                 (s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the result row at the saved dst address
                 (s1_p[0]) nop.m 0 // fills the second M slot of the bundle
                 br.ctop.sptk.few .Loop_16to8add // counted software-pipeline loop branch
                 ;;
         }
         
 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1 // mask -1: restore all predicate registers
 
         br.ret.sptk.many b0
         .endp transfer_16to8add_ia64#
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer_8to16sub_ia64
 //
 // The 8-bit values of ref and cur are loaded and converted to 16-bit. The
 // difference cur - ref is stored at the dct addresses, and the ref values are
 // copied into the cur array.
 //
 // You must assume that the data addressed by 'ref' are misaligned in memory.
 // But you can assume that the other data are aligned (at least I hope so).
 //      
 ///////////////////////////////////////////////////////////////////////////////
 
         .align 16
         .global transfer_8to16sub_ia64#
         .proc transfer_8to16sub_ia64#
         
         
 // transfer_8to16sub_ia64
 //   In:  r32 = dct, r33 = cur, r34 = ref, r35 = stride.
 //   Eight loop iterations (ar.lc = 7). Each iteration loads 8 bytes of cur
 //   and two aligned 8-byte blocks around ref, shifts/ORs the ref blocks into
 //   one 8-byte value r, stores r at the cur row address, and stores the
 //   16-bit differences cur - r as 2 x 8 bytes at dct (dct advances by 16
 //   bytes per iteration, cur and ref by stride).
 //   LL, SHL, OL, UL and PSL are not defined in this part of the file;
 //   presumably they are the per-stage latencies of the pipeline -- confirm.
 transfer_8to16sub_ia64:
         .prologue
 
 //      *** register renaming ***
         oldLC = r2
         oldPR = r3
 
         zero = r0 // "zero" stands for the number 0
         
         // The following registers get the same names as the variables in the C template
         dct = r14
         cur = r15
         ref = r34 // does not need to be saved separately (only read before the loop), so the argument register stays in this list
         stride = r16
         
         offset = r17 // bit offset of the misaligned data, used to shift it into place
         aoffset = r18 // counterpart of offset (64 - offset)
         ref_a1 = r19 // address of the first 64-bit block of ref
         ref_a2 = r20 // address of the second 64-bit block of ref
         
         _dct = r21 // register for the target addresses of the 2nd dct block
 
 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
         .save ar.lc, r2
         mov oldLC = ar.lc
         mov oldPR = pr
         
 
         .body
 
 //      *** Allocating new stackframe, define rotating registers ***
 //      Frame of 96 registers, all 96 rotating. The previous ar.pfs goes to r9
 //      and is not read again in this procedure.
 //      NOTE(review): alloc is not preceded by an explicit stop -- presumably
 //      the assembler inserts one, confirm.
         alloc r9 = ar.pfs, 4, 92, 0, 96
         
 //      *** Saving Parameters ***
         mov dct = r32 
         mov cur = r33
         // mov ref = r34: ref is unaligned, get aligned ref below...
         mov stride = r35
         
         and ref_a1 = -8, ref // address of the first 64-bit block in which ref lies (ref rounded down to a multiple of 8)
         dep offset = ref, zero, 3, 3 // offset = (ref & 7) << 3, i.e. the misalignment in bits
         ;;
         add ref_a2 = 8, ref_a1
         sub aoffset = 64, offset // counterpart of offset: aoffset = 64 - offset (64 when ref is 8-byte aligned)
         add _dct = 8, dct // address of the 2nd dct block, 8 bytes (= 64 bits) above the 1st block
 
 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // LC = 7 -> 8 loop iterations
         mov ar.ec = LL + SHL + OL + UL + PSL + 1 // same sum as the .rotp stage list below
         mov pr.rot = 1 << 16 // set p16 only, clear the other rotating predicates
         ;;
 
 //      *** define register arrays and predicate array for software pipeline ***
         .rotr  c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1],  dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
         .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
         
 
 //      Software pipelined loop:
 //      s1_p: The values of ref and cur are loaded, the cur address is saved.
 //      s2_p: cur is converted to 16-bit and the misaligned values of ref are
 //            shifted...
 //      s3_p: ... and ORed together.
 //      s4_p: This ref-value is converted to 16-bit and is also stored at the
 //            saved cur address.
 //      s5_p: the ref-values are subtracted from the cur-values...
 //      s6_p: ...and the result is stored at the dct-addresses.
 //      NOTE(review): the second block at ref_a2 is loaded in every iteration,
 //      also when ref is 8-byte aligned (offset = 0, aoffset = 64); the shl by
 //      64 is then expected to yield 0 -- confirm against the ISA manual.
 
  
 loop_8to16sub:
         {.mii
                 (s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // load the 1st 64-bit block containing part of the ref data; ref_a1 += stride
                 (s1_p[0]) mov _cur[0] = cur // the cur address is saved for later use (store in stage s4_p)
                 (s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // the right part is shifted into place
         }       
         {.mii
                 (s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // load the 2nd 64-bit block; ref_a2 += stride
                 (s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // the left part is shifted into place
                 (s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // the shifted parts are combined into r
         }
         {.mii
                 (s1_p[0]) ld8 c[0] = [cur], stride // load row j of cur completely; cur += stride
                 (s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is converted to 16-bit for i = 0..3
                 (s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is converted to 16-bit for i = 4..7
         }
         {.mii
                 (s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // the cur row is overwritten with the value of r
                 // convert the 8-bit r values into 16-bit values
                 (s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is converted to 16-bit for i = 0..3
                 (s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is converted to 16-bit for i = 4..7
         }
         {.mii
                 (s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // subtraction (c - r) for the 1st half of row j
                 (s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // subtraction (c - r) for the 2nd half
         }
         {.mmb
                 (s6_p[0]) st8 [dct] = dct_1[PSL], 16 // store the 1st 64-bit block at its target address, then advance the address by 16 bytes for the next row
                 (s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // store the 2nd 64-bit block at its target address, then advance the address by 16 bytes for the next row
                 br.ctop.sptk.few loop_8to16sub // counted software-pipeline loop branch
                 ;;
         }
         
 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1 // mask -1: restore all predicate registers
         
         br.ret.sptk.many b0
         .endp transfer_8to16sub_ia64#
         
 
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
 //
 // transfer_8to16sub2_ia64
 //
 // At the time, this function was written, it was not yet in use.
 // We assume that the values of ref1/2 are misaligned.
 // 
 // The values of ref1/2 and cur are loaded; the ref values need misalignment
 // treatment. The average of ref1 and ref2 is computed on the 8-bit values with
 // pavg1. The average and cur are then converted to 16-bit using unpack, the
 // average is subtracted from cur, and the results are stored at the dct
 // addresses.
 // pavg1.raz is used to get the same results as the C-code-function.
 // 
 /////////////////////////////////////////////////////////////////////////////// 
 
         .text
         .align 16
         .global transfer_8to16sub2_ia64#
         .proc transfer_8to16sub2_ia64#
         
 // transfer_8to16sub2_ia64
 //   In:  r32 = dct, r33 = cur, r34 = ref1, r35 = ref2, r36 = stride.
 //   Eight loop iterations (ar.lc = 7). Each iteration loads 8 bytes of cur
 //   and, for ref1 and ref2 each, two aligned 8-byte blocks that are shifted
 //   and ORed into one 8-byte value. The two ref values are averaged with
 //   pavg1.raz, the average and cur are widened to 16 bit, and cur - average
 //   is stored as 2 x 8 bytes at dct (dct advances by 16 bytes per iteration,
 //   cur/ref1/ref2 by stride). cur itself is not written.
 //   LL, SHL, OL, PAVGL, UL and PSL are not defined in this part of the file;
 //   presumably they are the per-stage latencies of the pipeline -- confirm.
 transfer_8to16sub2_ia64:
         .prologue
 
 //      *** register renaming ***
         //      We've tried to keep the C-Code names as often as possible, at least as
         //      part of register-names
         oldLC = r2
         oldPR = r3
         
         zero = r0
         
         dct_al = r14 // dct: address of left block in one line (set to dct + 8 below)
         dct_ar = r15 // dct: address of right block in one line (set to dct below)
         cur = r16
         ref1_al = r17 // ref1: aligned address of lower part
         ref1_ah = r18 // ref1: aligned address of higher part
         ref2_al = r19 // ref2: aligned address of lower part
         ref2_ah = r20 // ref2: aligned address of higher part
         stride = r21
         
         offset_1 = r22 // misalignment of ref1 in bits
         offset_2 = r23 // misalignment of ref2 in bits
         aoffset_1 = r24 // 64 - offset_1
         aoffset_2 = r25 // 64 - offset_2
 
 //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
         .save ar.lc, r2
         mov oldLC = ar.lc
         mov oldPR = pr
 
 
         .body           
 
 //      *** Saving Parameters ***
 //      *** (as inputregisters r32 + are needed for register-rotation) ***
 //      Note: the arguments are read here before the alloc further below.
         mov dct_ar = r32        
         add dct_al = 8, r32     
         mov cur = r33
         
         and ref1_al = -8, r34   
         and ref2_al = -8, r35   // ref2 aligned address of lower part
         
         mov stride = r36
         
 //      ***     Calculations for Misalignment-Handling ***
         dep offset_1 = r34, zero, 3, 3 // offset_1 = (ref1 & 7) << 3
         dep offset_2 = r35, zero, 3, 3 // offset_2 = (ref2 & 7) << 3
         ;;
         add ref1_ah = 8, ref1_al
         add ref2_ah = 8, ref2_al
         sub aoffset_1 = 64, offset_1
         sub aoffset_2 = 64, offset_2
         ;;
 
 //      *** Allocating new stackframe, define rotating registers ***
 //      Frame of 96 registers, all 96 rotating. The previous ar.pfs goes to r9
 //      and is not read again in this procedure.
         alloc r9 = ar.pfs, 5, 91, 0, 96
         
 //      *** init loop: set loop counter, epilog counter, predicates ***
         mov ar.lc = 7 // LC = 7 -> 8 loop iterations
         mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 // same sum as the .rotp stage list below
         mov pr.rot = 1 << 16 // set p16 only, clear the other rotating predicates
         ;;
         
 //      *** define register arrays and predicate array for software pipeline ***
         .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
         .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
 
  
 //      software pipelined loop:
 //      ld_stage:   The values of ref1, ref2, cur are loaded
 //      sh_stage:   The misaligned values of ref1/2 are shifted...
 //      or_stage:   ...and ORed together. 
 //      pavg_stage: The average of ref1 and ref2 is computed.
 //      up_stage:   The result and the cur-values are converted to 16-bit.
 //      psub_stage: The averaged ref-values are subtracted from the cur-values...
 //      st_stage:   ...and stored at the dct-addresses.
 //      NOTE(review): the higher blocks (ref1_ah / ref2_ah) are loaded in every
 //      iteration, also when the pointer is 8-byte aligned (offset = 0,
 //      aoffset = 64); the shl by 64 is then expected to yield 0 -- confirm
 //      against the ISA manual.
 
  
 .Loop_8to16sub2:
         {.mii
                 (ld_stage[0])   ld8 c[0] = [cur], stride // load one row of cur; cur += stride
                 (sh_stage[0])   shr.u ref1_l[0] = ref1_vl[LL], offset_1 // shift the lower ref1 block into place
                 (sh_stage[0])   shl ref1_h[0] = ref1_vh[LL], aoffset_1 // shift the higher ref1 block into place
         }
         {.mii
                 (ld_stage[0])   ld8 ref1_vl[0] = [ref1_al], stride // lower aligned block of ref1
                 (sh_stage[0])   shr.u ref2_l[0] = ref2_vl[LL], offset_2 // shift the lower ref2 block into place
                 (sh_stage[0])   shl ref2_h[0] = ref2_vh[LL], aoffset_2 // shift the higher ref2 block into place
         }
         {.mii
                 (ld_stage[0])   ld8 ref1_vh[0] = [ref1_ah], stride // higher aligned block of ref1
                 (or_stage[0])   or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] // combine the two parts of ref1
                 (or_stage[0])   or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] // combine the two parts of ref2
         }
         {.mii
                 (ld_stage[0])   ld8 ref2_vl[0] = [ref2_al], stride // lower aligned block of ref2
                 (pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] // bytewise average of ref1 and ref2
                 (up_stage[0])   unpack1.l r16_r[0] = zero, r[PAVGL] // average -> 16-bit, part stored via dct_ar
         }
         {.mii           
                 (ld_stage[0])   ld8 ref2_vh[0] = [ref2_ah], stride // higher aligned block of ref2
                 (up_stage[0])   unpack1.h r16_l[0] = zero, r[PAVGL] // average -> 16-bit, part stored via dct_al
                 (up_stage[0])   unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] // cur -> 16-bit, part stored via dct_ar
         }
         {.mii                   
                 (st_stage[0])   st8 [dct_ar] = dct16_r[PSL], 16 // store at dct + 0 of this row; dct_ar += 16
                 (up_stage[0])   unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] // cur -> 16-bit, part stored via dct_al
                 (psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] // cur - average
         }
         {.mib           
                 (st_stage[0])   st8 [dct_al] = dct16_l[PSL], 16 // store at dct + 8 of this row; dct_al += 16
                 (psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]             // cur - average
                 br.ctop.sptk.few .Loop_8to16sub2 // counted software-pipeline loop branch
                 ;;
         }
                 
 //      *** Restore old LC and PRs ***
         mov ar.lc = oldLC
         mov pr = oldPR, -1 // mask -1: restore all predicate registers
 
         br.ret.sptk.many b0
         .endp transfer_8to16sub2_ia64#
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -