; fast hadamard transform
; TMS320C6x assembly listing (excerpt: stages 4-6 of a 64-point transform)
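;
; Register conventions in this excerpt (inferred from the code below):
;   a15 = 0x0000ffff, the mask used to keep the low half-word of a register.
;   b13 = (1 << 16) | loop counter. With 1 in its upper half, the MPY*U
;         instructions double as half-word extracts: mpyhu x,b13,y copies
;         the upper half of x into y zero-extended, mpylhu x,b13,y keeps
;         only the lower half of x, and mpyhl b13,b13,y recovers the
;         counter itself.
;   a13 = byte stride of one 16-element output block (evidently 32).
;   dN(k) in the comments names element k of the stage-N intermediate.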
sub2.s1 a7,a2,a9 ; d4(6) and d4(7)
|| sub2.s2 b7,b2,b9 ; d4(22) and d4(23)
|| mpyhu.m1 a0,b13,a1 ; d4(1)
|| and.l1 a0,a15,a0 ; d4(0)
|| mpyhu.m2 b0,b13,b1 ; d4(17)
|| and.l2 b0,a15,b0 ; d4(16)
add2.s1 a7,a2,a2 ; d4(4) and d4(5)
|| add2.s2 b7,b2,b2 ; d4(20) and d4(21)
|| mpyhu.m1 a6,b13,a7 ; d4(3)
|| and.l1 a6,a15,a6 ; d4(2)
|| mpyhu.m2 b6,b13,b7 ; d4(19)
|| and.l2 b6,a15,b6 ; d4(18)
; Start of 5th stage
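; NOTE: stages 4 and 5 are software-pipelined, which is why "Start of
; 5th stage" appears before "End of 4th stage": the stage-5 adds and
; subtracts run in the same execute packets as the .m-unit half-word
; extracts still finishing stage 4, keeping all eight functional units
; busy.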
sub2.s1 a4,a5,a12 ; d4(10) and d4(11)
|| sub2.s2 b4,b5,b12 ; d4(26) and d4(27)
|| add.l1 a0,a1,a0 ; d5(0)
|| sub.d1 a0,a1,a1 ; d5(1)
|| add.l2 b0,b1,b0 ; d5(16)
|| sub.d2 b0,b1,b1 ; d5(17)
|| mpyhu.m1 a2,b13,a10 ; d4(5)
|| mpyhu.m2 b2,b13,b10 ; d4(21)
add2.s1 a4,a5,a4 ; d4(8) and d4(9)
|| add2.s2 b4,b5,b4 ; d4(24) and d4(25)
|| add.l1 a6,a7,a6 ; d5(2)
|| sub.d1 a6,a7,a7 ; d5(3)
|| add.l2 b6,b7,b6 ; d5(18)
|| sub.d2 b6,b7,b7 ; d5(19)
|| mpylhu.m1 a2,b13,a2 ; d4(4)
|| mpylhu.m2 b2,b13,b2 ; d4(20)
sub2.s1 a3,a8,a5 ; d4(14) and d4(15)
|| sub2.s2 b3,b8,b5 ; d4(30) and d4(31)
|| mpyhu.m1 a9,b13,a11 ; d4(7)
|| and.l1 a9,a15,a9 ; d4(6)
|| mpyhu.m2 b9,b13,b11 ; d4(23)
|| and.l2 b9,a15,b9 ; d4(22)
add2.s1 a3,a8,a3 ; d4(12) and d4(13)
|| add2.s2 b3,b8,b3 ; d4(28) and d4(29)
|| and.l1 a0,a15,a0 ; clear upper half of register
|| and.l2 b0,a15,b0 ; clear upper half of register
add.l1 a2,a10,a2 ; d5(4)
|| sub.d1 a2,a10,a10 ; d5(5)
|| add.l2 b2,b10,b2 ; d5(20)
|| sub.d2 b2,b10,b10 ; d5(21)
|| mpyhu.m1 a4,b13,a8 ; d4(9)
|| mpyhu.m2 b4,b13,b8 ; d4(25)
|| and.s1 a6,a15,a6 ; clear upper half of register
|| and.s2 b6,a15,b6 ; clear upper half of register
shl.s1 a1,16,a1 ; d5(1) in upper half
|| shl.s2 b1,16,b1 ; d5(17) in upper half
|| add.l1 a9,a11,a9 ; d5(6)
|| sub.d1 a9,a11,a11 ; d5(7)
|| add.l2 b9,b11,b9 ; d5(22)
|| sub.d2 b9,b11,b11 ; d5(23)
|| mpylhu.m1 a4,b13,a4 ; d4(8)
|| mpylhu.m2 b4,b13,b4 ; d4(24)
shl.s1 a7,16,a7 ; d5(3) in upper half
|| shl.s2 b7,16,b7 ; d5(19) in upper half
|| add.d1 a0,a1,a0 ; d5(0) and d5(1)
|| add.d2 b0,b1,b0 ; d5(16) and d5(17)
|| mpyhu.m1 a12,b13,a1 ; d4(11)
|| and.l1 a12,a15,a12 ; d4(10)
|| mpyhu.m2 b12,b13,b1 ; d4(27)
|| and.l2 b12,a15,b12 ; d4(26)
and.l1 a6,a15,a6 ; clear upper half of register
|| and.l2 b6,a15,b6 ; clear upper half of register
|| mpylhu.m1 a2,b13,a2 ; clear upper half of register
|| mpylhu.m2 b2,b13,b2 ; clear upper half of register
shl.s1 a10,16,a10 ; d5(5) in upper half
|| shl.s2 b10,16,b10 ; d5(21) in upper half
|| add.d1 a6,a7,a6 ; d5(2) and d5(3)
|| add.d2 b6,b7,b6 ; d5(18) and d5(19)
|| mpyhu.m1 a3,b13,a7 ; d4(13)
|| and.l1 a3,a15,a3 ; d4(12)
|| mpyhu.m2 b3,b13,b7 ; d4(29)
|| and.l2 b3,a15,b3 ; d4(28)
shl.s1 a11,16,a11 ; d5(7) in upper half
|| shl.s2 b11,16,b11 ; d5(23) in upper half
|| add.d1 a2,a10,a2 ; d5(4) and d5(5)
|| add.d2 b2,b10,b2 ; d5(20) and d5(21)
|| mpyhu.m1 a5,b13,a10 ; d4(15)
|| and.l1 a5,a15,a5 ; d4(14)
|| mpyhu.m2 b5,b13,b10 ; d4(31)
|| and.l2 b5,a15,b5 ; d4(30)
; End of 4th stage
stw.d1 a0,*a14++ ; store d5(0),d5(1)
|| add.l1 a4,a8,a4 ; d5(8)
|| sub.s1 a4,a8,a8 ; d5(9)
|| add.l2 b4,b8,b4 ; d5(24)
|| sub.s2 b4,b8,b8 ; d5(25)
shl.s1 a8,16,a8 ; d5(9) in upper half
|| shl.s2 b8,16,b8 ; d5(25) in upper half
|| add.l1 a12,a1,a12 ; d5(10)
|| sub.d1 a12,a1,a1 ; d5(11)
|| add.l2 b12,b1,b12 ; d5(26)
|| sub.d2 b12,b1,b1 ; d5(27)
shl.s1 a1,16,a1 ; d5(11) in upper half
|| shl.s2 b1,16,b1 ; d5(27) in upper half
|| add.l1 a3,a7,a3 ; d5(12)
|| sub.d1 a3,a7,a7 ; d5(13)
|| add.l2 b3,b7,b3 ; d5(28)
|| sub.d2 b3,b7,b7 ; d5(29)
|| mpylhu.m1 a9,b13,a9 ; clear upper half of register
|| mpylhu.m2 b9,b13,b9 ; clear upper half of register
shl.s1 a7,16,a7 ; d5(13) in upper half
|| shl.s2 b7,16,b7 ; d5(29) in upper half
|| add.l1 a5,a10,a5 ; d5(14)
|| sub.d1 a5,a10,a10 ; d5(15)
|| add.l2 b5,b10,b5 ; d5(30)
|| sub.d2 b5,b10,b10 ; d5(31)
|| mpyhl.m2 b13,b13,b0 ; extract the counter in b0
stw.d1 a6,*a14++ ; store d5(2),d5(3)
|| stw.d2 b0,*b14++ ; store d5(16),d5(17) (mpyhl above has not written b0 yet)
|| shl.s1 a10,16,a10 ; d5(15) in upper half
|| shl.s2 b10,16,b10 ; d5(31) in upper half
|| add.l1 a9,a11,a9 ; d5(6) and d5(7)
|| add.l2 b9,b11,b9 ; d5(22) and d5(23)
and.l1 a4,a15,a4 ; clear upper half of register
|| and.l2 b4,a15,b4 ; clear upper half of register
|| mpylhu.m1 a12,b13,a12 ; clear upper half of register
|| mpylhu.m2 b12,b13,b12 ; clear upper half of register
and.l1 a3,a15,a3 ; clear upper half of register
|| and.l2 b3,a15,b3 ; clear upper half of register
|| mpylhu.m1 a5,b13,a5 ; clear upper half of register
|| mpylhu.m2 b5,b13,b5 ; clear upper half of register
stw.d1 a2,*a14++ ; store d5(4),d5(5)
|| stw.d2 b6,*b14++ ; store d5(18),d5(19)
|| add.l1 a4,a8,a4 ; d5(8) and d5(9)
|| add.l2 b4,b8,b4 ; d5(24) and d5(25)
|| sub.s2 b0,1,b0 ; decrement counter
stw.d1 a9,*a14++ ; store d5(6),d5(7)
|| stw.d2 b2,*b14++ ; store d5(20),d5(21)
|| add.l1 a3,a7,a3 ; d5(12) and d5(13)
|| add.l2 b3,b7,b3 ; d5(28) and d5(29)
|| add.s1 a12,a1,a12 ; d5(10) and d5(11)
|| add.s2 b12,b1,b12 ; d5(26) and d5(27)
|| mpylh.m2 b0,b13,b13 ; move counter to b13(low)
stw.d1 a4,*a14++ ; store d5(8),d5(9)
|| stw.d2 b9,*b14++ ; store d5(22),d5(23)
|| add.l1 a5,a10,a5 ; d5(14) and d5(15)
|| add.l2 b5,b10,b5 ; d5(30) and d5(31)
stw.d1 a12,*a14++ ; store d5(10),d5(11)
|| stw.d2 b4,*b14++ ; store d5(24),d5(25)
|| mvklh.s2 1,b13 ; b13(high) = 1 ( for multiplications)
||[b0] b.s1 H5_loop ; branch back
stw.d1 a3,*a14++ ; store d5(12),d5(13)
|| stw.d2 b12,*b14++ ; store d5(26),d5(27)
stw.d1 a5,*a14++ ; store d5(14),d5(15)
|| stw.d2 b3,*b14++ ; store d5(28),d5(29)
||[!b0] mv.l2 a13,b13 ; if not looping, b13 = a13 for the rewind below
stw.d2 b5,*b14++ ; store d5(30),d5(31)
; End of 5th stage
; Adjust pointers
[b0] add.l1 a14,a13,a14 ; if looping back, a14 = &d(32)
|| [b0] add.l2 b14,a13,b14 ; if looping back, b14 = &d(48)
||[!b0] sub.s1 a14,a13,a14 ; if not looping, a14 = &d(32)
||[!b0] sub.s2 b14,b13,b14 ; if not looping, b14 = &d(48)
[!b0] sub.s1 a14,a13,a14 ; if not looping, a14 = &d(16)
||[!b0] sub.s2 b14,b13,b14 ; if not looping, b14 = &d(32)
[!b0] sub.s1 a14,a13,a14 ; if not looping, a14 = &d(0)
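; After the rewinds above, a14 = &d(0) and b14 = &d(32): exactly the
; state the H6 setup below expects when deriving its four load pointers.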
; Branch to H5_loop occurs here
; H6 loop (last stage)
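; Strategy: the last stage pairs element i of the upper half (d[0..31])
; with element i+32 of the lower half (d[32..63]). Each 32-bit word
; already packs two 16-bit results, so one add2/sub2 pair performs two
; butterflies, and four pointers (even and odd words of each half,
; stride 2 words) stream the data through the loop.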
H6:
mvk.s2 8,b0 ; initialize counter
|| mv b14,a13 ; set up load pointers:
|| add a14,4,b14 ; a14 = &d[0] ; b14 = &d[2] (upper even and odd)
|| add b14,4,b13 ; a13 = &d[32] ; b13 = &d[34] (lower even and odd)
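; The four instructions above form one execute packet, and instructions
; in the same packet read their sources before any of the packet's
; writes land: "add b14,4,b13" therefore uses the incoming b14 (&d[32]),
; not the &d[2] written beside it, giving b13 = &d[34] as commented.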
ldw.d1 *a14++[2],a4 ; upper even load
|| ldw.d2 *b14++[2],b4 ; upper odd load
|| mv a14,a12 ; set up store pointers
|| mv b14,b12 ; a12 = &d[0] ; b12 = &d[2]
ldw.d1 *a13++[2],a3 ; lower even load
|| ldw.d2 *b13++[2],b3 ; lower odd load
|| mv a13,a11 ; set up store pointers
|| mv b13,b11 ; a11 = &d[32] ; b11 = &d[34]
nop 2
ldw.d1 *a14++[2],a4 ; upper even load
|| ldw.d2 *b14++[2],b4 ; upper odd load
ldw.d1 *a13++[2],a3 ; lower even load
|| ldw.d2 *b13++[2],b3 ; lower odd load
add2.s1 a4,a3,a2 ; upper/lower even -> upper even
|| add2.s2 b4,b3,b2 ; upper/lower odd -> upper odd
sub2.s1 a4,a3,a1 ; upper/lower even -> lower even
|| sub2.s2 b4,b3,b1 ; upper/lower odd -> lower odd
||[b0] sub.l2 b0,1,b0 ; decrement counter
ldw.d1 *a14++[2],a4 ; upper even load
|| ldw.d2 *b14++[2],b4 ; upper odd load
||[b0] b H6loop
ldw.d1 *a13++[2],a3 ; lower even load
|| ldw.d2 *b13++[2],b3 ; lower odd load
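; The code above primes the software pipeline: each LDW issues five or
; six cycles before its butterfly consumes it (covering LDW's four
; delay slots), and because the branch's five delay slots exceed the
; 4-cycle loop body, a branch is issued in every iteration (plus once
; here) so one is always in flight. When b0 reaches 0 the predicated
; branches stop being issued and execution drains into the context
; restore.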
H6loop:
add2.s1 a4,a3,a2 ; upper/lower even -> upper even
|| add2.s2 b4,b3,b2 ; upper/lower odd -> upper odd
|| stw a2,*a12++[2] ; store upper even word
|| stw b2,*b12++[2] ; store upper odd word
sub2.s1 a4,a3,a1 ; upper/lower even -> lower even
|| sub2.s2 b4,b3,b1 ; upper/lower odd -> lower odd
||[b0] sub.l2 b0,1,b0 ; decrement counter
|| stw a1,*a11++[2] ; store lower even word
|| stw b1,*b11++[2] ; store lower odd word
ldw.d1 *a14++[2],a4 ; upper even load
|| ldw.d2 *b14++[2],b4 ; upper odd load
||[b0] b H6loop
ldw.d1 *a13++[2],a3 ; lower even load
|| ldw.d2 *b13++[2],b3 ; lower odd load
; Branch to H6loop occurs here
; Restore context
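; b3 (the return address) is popped first so it is ready just in time
; for the return branch; a8 mirrors the stack pointer (b15 + 4) so the
; .d1/.d2 units can pop A-side and B-side registers in parallel, and
; the final load pair plus "nop 4" fill the branch's five delay slots.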
ldw.d2 *+b15[10],b3 ; pop b3 (return address)
add.l1x b15,4,a8 ; copy stack pointer
ldw.d2 *b15++[2],b14 ; pop b14
|| ldw.d1 *a8++[2],a14 ; pop a14
ldw.d2 *b15++[2],b13 ; pop b13
|| ldw.d1 *a8++[2],a13 ; pop a13
ldw.d2 *b15++[2],b12 ; pop b12
|| ldw.d1 *a8++[2],a12 ; pop a12
ldw.d2 *b15++[2],b11 ; pop b11
|| ldw.d1 *a8++[2],a11 ; pop a11
|| b.s2 b3
ldw.d2 *b15++[2],b10 ; pop b10
|| ldw.d1 *a8,a10 ; pop a10
nop 4
.end
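; ---------------------------------------------------------------------
; Reference model (not part of the original listing)
; ---------------------------------------------------------------------
The butterfly network above is the standard in-place fast Walsh-Hadamard
recursion. The scalar C sketch below is a minimal reference, assuming a
64-point transform over 16-bit samples (the names fht_ref, N, and the
impulse test are illustrative, not taken from the listing); the assembly
computes the same sums and differences, but packs two 16-bit butterflies
into each 32-bit add2/sub2 and splits the work across the A- and B-side
register files.

    #include <stdint.h>
    #include <stdio.h>

    #define N 64                            /* 6 stages = log2(64) */

    static void fht_ref(int16_t d[N])
    {
        /* One pass per stage: pair elements "span" apart and replace
           them with their sum and difference, in wrapping 16-bit
           arithmetic like the packed add2/sub2 above. */
        for (int span = 1; span < N; span <<= 1)
            for (int i = 0; i < N; i += 2 * span)
                for (int j = i; j < i + span; j++) {
                    int16_t a = d[j], b = d[j + span];
                    d[j]        = (int16_t)(a + b);
                    d[j + span] = (int16_t)(a - b);
                }
    }

    int main(void)
    {
        int16_t d[N] = { 0 };
        d[0] = 1;                           /* impulse in -> all-ones out */
        fht_ref(d);
        for (int i = 0; i < N; i++)
            printf("%d%c", d[i], (i % 16 == 15) ? '\n' : ' ');
        return 0;
    }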