📄 bitmanm.h
字号:
ORRCC $d,$d,#0x01000000 ;Now bits 0, 8, 16, 24 are borrows out
; of the four bytes
RSB $t,$d,$d,LSL #8 ;Multiply these by 255 to get bytes of
; 0x00 or 0xFF according to whether
; byte generated a borrow in a-b
AND $c,$c,$t ;Generate (a byte) EOR 0 = (a byte)
EOR $c,$c,$a ; for non-borrowing bytes, (a byte)
; EOR ((a byte) EOR (b byte)) =
; (b byte) for borrowing bytes.
;This uses the same sort of ideas as the BCDADD code. To start with,
;suppose we do the 32-bit subtraction a-b. Then for each byte, we know:
;
; If (a byte) > (b byte), no "borrow" will be generated from the next byte
; up during this subtraction.
;
; If (a byte) < (b byte), a "borrow" will definitely be generated from the
; next byte up during this subtraction.
;
; If (a byte) = (b byte), a "borrow" may or may not be generated from the
; next byte during this subtraction: whether it is depends on whether this
; byte receives a "borrow" from the next byte down.
;
;So a suitable value for d has:
;
; Bit 0 = "borrow" from bit 8 generated by bit 7 during the subtraction;
; Bit 8 = "borrow" from bit 16 generated by bit 15 during the subtraction;
; Bit 16 = "borrow" from bit 24 generated by bit 23 during the subtraction;
; Bit 24 = "borrow" from outside the word generated by bit 31 during the
; subtraction;
; All other bits zero.
;
;The C flag generated by a SUBS instruction is equal to NOT("borrow" from
;outside the word), so the ORRCC instruction puts the right value into bit 24
;of d. The other three "borrows" required are found by using the fact that:
;
; (Bit i of a-b) = (a bit i) EOR (b bit i) EOR ("borrow" from bit i)
;
;so that:
;
; ("borrow" from bit i) = (a bit i) EOR (b bit i) EOR (Bit i of a-b)
;
;and so the EOR/SUBS/EOR/AND sequence puts the right values into bits 0, 8
;and 16 of d.
;
;The code then uses d to generate a bit mask equal to 0x00 in the bytes
;where a is wanted and to ((a byte) EOR (b byte)) in the bytes where b is
;wanted. Finally, this is EORed with a to produce the desired value for c.
MEND
;---------------------------------------------------------------
; Bit set in a word
; 6 cycles + possible 6 cycles for lookup table
;
;---------------------------------------------------------------
; $a input word with at most one bit set
; $c output result of finding position of bit set in a
; $table register containing pointer to address of lookup table
; if register does not contain address for lookup table,
; do not pass anything for $hastable
; table address given by BitSetTable in file bstables.s
; $hastable parameter which should only be given if $table contains a valid
; address for the lookup table that is required
; the value given can be anything and is not required to
; to be a register
; if $table is not valid, do not pass anything for this parameter
;
; the code returns in c the position of the bit that
; is set in a, or 32 if the a = 0
;
; Both registers $a and $c must be distinct from register $table
; Register $a need not be distinct from $c.
;
;---------------------------------------------------------------
IMPORT BitSetTable
MACRO
GETBITSET $c, $a, $table, $hastable
DISTINCT $a, $table
DISTINCT $c, $table
IF "$hastable" = "" ; table not given so needs defining
B %FT02 ; branch over the DCD declaration next
01 ; label for the DCD declation
DCD BitSetTable
02 LDR $table, %BT01 ; put pointer to table into $table
; LDR $table, =BitSetTable ; not used since may not be able to create
; literal pool near enough, given method will
; always work
ENDIF
;to translate the word with 0 or 1 bit set into the
;position of that bit, or a special value to indicate that
;no bits are set, one option would be to use a "divide and conquer"
;approach - start with a result of 0 and check whether the top
;16 bits are zero: if not, increment the result by 16 and shift
;the value right by 16. Then do similar things for bits 15:8,
;7:4, 3:2, 1 and 0. This takes about 15-20 instructions.
;a much faster method involves multiplication by a "magic constant".
;There are only 33 possible values if at most one bit set.
;If we can multiply by a constant which ends up with a different
;value in the top 6 bits of the product for each of these 33 values,
;we can then shift that product right by 32-6 = 26 bits and use
;it as an index into a lookup table. After some searching for a
;suitable "magic constant", this resulted in the following code:
;the following multiplication of c by 0x11 and then 0x451 uses
;ORR instructions rather than ADDs since doesn't affect the result
;because X has 0 or 1 bit set, and it speeds up the code on ARM8.
ORR $c,$a,$a,LSL #4 ;c = X * 0x11
ORR $c,$c,$c,LSL #6 ;c = X * 0x451
RSB $c,$c,$c,LSL #16 ;c = X * 0x0450FBAF
LDRB $c,[$table,$c,LSR #26] ;look up table entry indexed by top bits of c
MEND
;---------------------------------------------------------------
; Least Significant Bit set in a word
; 8 cycles + possible 6 cycles for lookup table
;
;---------------------------------------------------------------
; $a input word
; $c output result of finding least significant bit set in a
; $table register containing pointer to address of lookup table
; if register does not contain address for lookup table,
; do not pass anything for $hastable
; table address given by BitSetTable in file bstables.s
; $hastable parameter which should only be given if $table contains a valid
; address for the lookup table that is required
; the value given can be anything and is not required to
; to be a register
; if $table is not valid, do not pass anything for this parameter
;
; the code returns in c the position of the least significant bit that
; is set in a, or 32 if a = 0
;
; Both registers $a and $c must be distinct from register $table
; Register $a need not be distinct from $c.
;
;---------------------------------------------------------------
MACRO
LSBSET $c, $a, $table, $hastable
;locating the bottom bit is fairly easy, using the fact that
;the 2's complement of a number is identical to the number
;from bit 0 up to and including the least significant 1,
;and its bitwise inverse above that. So (a AND -a) will have a
;1 at the position of the least significant 1 in a and zeros
;at all other positions. (Or will be zero everywhere if a=0.)
RSB $c,$a,#0 ;Standard trick to isolate X = bottom bit of
AND $c,$c,$a ;a in c, or produce X = 0 in c if a = 0.
GETBITSET $c, $c, $table, $hastable
MEND
;---------------------------------------------------------------
; Most Significant Bit set in a word
; 12 cycles + possible 6 cycles for lookup table
;
;---------------------------------------------------------------
; $a input word
; $c output result of finding most significant bit set in a
; $table register containing pointer to address of lookup table
; if register does not contain address for lookup table,
; do not pass anything for $hastable
; table address given by BitSetTable in file bstables.s
; $hastable parameter which should only be given if $table contains a valid
; address for the lookup table that is required
; the value given can be anything and is not required to
; to be a register
; if $table is not valid, do not pass anything for this parameter
;
; the code returns in c the position of the most significant bit that
; is set in a, or 32 if a = 0
;
; Both registers $a and $c must be distinct from register $table
; Register $a need not be distinct from $c.
;
;---------------------------------------------------------------
MACRO
MSBSET $c, $a, $table, $hastable
;located the top bit
ORR $c,$a,$a,LSR #1
ORR $c,$c,$c,LSR #2
ORR $c,$c,$c,LSR #4
ORR $c,$c,$c,LSR #8
ORR $c,$c,$c,LSR #16 ;c contains 1s from the most significant
;1 in a downwards.
BIC $c,$c,$c,LSR #1 ;c now contains only the most significant 1 in a.
GETBITSET $c, $c, $table, $hastable
MEND
;---------------------------------------------------------------
; Population count
; 10 cycles + 2 register constants
;
;---------------------------------------------------------------
; $a input word
; $c output result of population count
; $t temporary register
;
; constant1 = 0x49249249
; constant2 = 0xC71C71C7
;
; the code returns in c the number of 1's set in the binary
; expression for a
;
; uses 'divide by 3 and conquer'.
;
; Registers $constant1, $constant2, $t and $c must all be distinct from each other
; Registers $a and $t must be distinct registers from each other.
; Register $a need not be distinct from one of $constant1, $constant2 or $c
;
;---------------------------------------------------------------
MACRO
POPCOUNT $c, $a, $t, $constant1, $constant2
DISTINCT $c, $t, $constant1, $constant2
DISTINCT $a, $t
; Use a divide by 3 and conquer approach
; Add up bits in the triplets 31:29,
AND $t, $a, $constant1, LSL #1
; 28:26, 25:23, 22:20, 19:17, 16:14,
SUB $c, $a, $t, LSR #1
; 13:11, 10:8, 7:5, 4:2 and the
AND $t, $c, $constant1, LSR #1
; pair 1:0
ADD $c, $c, $t
; The last four instructions basically halved the value
; of the top bit in each triplet and doubled the value
; of the bottom bit
; Add adjacent triplet sums,
ADD $c, $c, $c, LSR #3
; and then take every alternate result
AND $c, $c, $constant2
; Add the resulting sums of 3, 6, 6,
ADD $c, $c, $c, LSR #6
; 6, 6 and 5 bits together
ADD $c, $c, $c, LSR #12
ADD $c, $c, $c, LSR #24
; Throw away all the "junk" in the remaining bits
AND $c, $c, #63
MEND
;---------------------------------------------------------------
; Population count over seven words
; 46 cycles + 3 register constants
;
;---------------------------------------------------------------
; $a, $b input words a and b
; $c, $d input words c and d
; $e, $f input words e and f
; $g input word g
; $res output result of population count over seven words
;
; constant1 = 0x55555555
; constant2 = 0x33333333
; constant3 = 0x0F0F0F0F
;
; the code returns in res the number of 1's set in the binary
; expressions for all the words a, b, c, d, e, f and g
;
; uses 'divide by 3 and conquer'.
;
; Registers $a, $b, $c, $d, $e, $f and $g are corrupted.
;
; Registers $a, $b, $c, $d, $e, $f, $g, $constant1, $constant2 and
; $constant3 must all be distinct registers from each other.
; Register $res need not be distinct from any one of the other registers.
;
;---------------------------------------------------------------
; The macro definitions prefixed POPCOUNT7_INTERNAL are intended only for use
; internally to POPCOUNT7 and have no significance outside of that context
MACRO
POPCOUNT7_INTERNAL_PARALLEL_ADD_MACRO $a, $b, $c
; Calculate 2*b+a := a+b+c
EOR $a,$a,$b
BIC $b,$b,$a ;Calculates b AND NOT (a EOR b) = a AND b
EOR $a,$a,$c
BIC $c,$c,$a ;Same technique
ORR $b,$b,$c
MEND
MACRO
POPCOUNT7_INTERNAL_POPCOUNT_MACRO $c, $a, $constant1, $constant2, $constant3
; Does a population count on a, stopping at the point where each byte
; contains its own population count in the range 0-8 with the result
; stored in c - same constants as given above
BIC $c,$a,$constant1
SUB $a,$a,$c,LSR #1
BIC $c,$a,$constant2
AND $a,$a,$constant2
ADD $a,$a,$c,LSR #2
ADD $a,$a,$a,LSR #4
AND $a,$a,$constant3
MEND
MACRO
POPCOUNT7 $res, $a, $b, $c, $d, $e, $f, $g, $constant1, $constant2, $constant3
DISTINCT $a, $b, $c, $d, $e, $f, $g, $constant1, $constant2, $constant3
; We start by doing a "parallel addition" of the words. The basic idea
; here is that if we have two 32-bit values a and b, and we calculate
; x = a AND b, y = a EOR b, then at each bit position, we have:
;
; 2*(x bit) + (y bit) = (a bit) + (b bit)
;
; i.e. we have effectively added 32 1-bit numbers in parallel, storing the
; least significant bit of each sum in y and the next bit in x. This
; idea is extended here to produce something similar which does:
;
; 4*(c bit) + 2*(b bit) + (a bit)
; = (a bit)+(b bit)+(c bit)+(d bit)+(e bit)+(f bit)+(g bit)
;
; Relationships like this are denoted in an abbreviated way in the
; comments below - this one is described as 4c+2b+a :=
; a+b+c+d+e+f+g, for instance.
;
; Then we just have to calculate 4*popcount(c)+2*popcount(b)+popcount(a)
; to get the desired result. Some code is saved here by only doing part of
; the popcount operations before combining the results.
;
; The net result is 46 instructions and 46 cycles to do a population count on
; 224 bits, i.e. a counting rate of nearly 5 bits per cycle. (As a point of
; comparison, naive code which simply shifts the bits out of the registers and
; increments a register each time a 1 is shifted out is likely to end up more
; like 2-6 cycles per bit, depending on the amount of loop unrolling i.e.
; this code is about 10-30 times faster.)
; 2*b+a := a+b+c:
POPCOUNT7_INTERNAL_PARALLEL_ADD_MACRO $a, $b, $c
; 2*e+d := d+e+f similarly:
POPCOUNT7_INTERNAL_PARALLEL_ADD_MACRO $d, $e, $f
; 2*d+a := (new a)+(new d)+g similarly:
POPCOUNT7_INTERNAL_PARALLEL_ADD_MACRO $a, $d, $g
; Stuff so far has given us 2*b+2*d+2*e+a := a+b+c+d+e+f+g
; overall, so we can complete 4*c+2*b+a := a+b+c+d+e+f+g by
; doing 2*c+b := b+d+e:
; Not quite the same as POPCOUNT7_INTERNAL_PARALLEL_ADD_MACRO (though close)
EOR $b,$b,$d
BIC $c,$d,$b
EOR $b,$b,$e
BIC $e,$e,$b
ORR $c,$c,$e
; Do population count on a, stopping at the point where we have each byte
; containing its own population count in the range 0-8. Then do the same
; for each of b and c.
POPCOUNT7_INTERNAL_POPCOUNT_MACRO $g, $a, $constant1, $constant2, $constant3
POPCOUNT7_INTERNAL_POPCOUNT_MACRO $g, $b, $constant1, $constant2, $constant3
POPCOUNT7_INTERNAL_POPCOUNT_MACRO $g, $c, $constant1, $constant2, $constant3
; Add up 4*popcount(c)+2*popcount(b)+popcount(a) for each byte
; separately now, obtaining a number in a with 4 bytes in the range 0-56
; to be added together.
ADD $a,$a,$b,LSL #1
ADD $a,$a,$c,LSL #2
; Add them together in the bottom byte, discarding the junk left in the
; other bytes.
ADD $a,$a,$a,LSR #8
ADD $a,$a,$a,LSR #16
AND $res,$a,#0xFF
MEND
;;;;;;;
; END ;
;;;;;;;
END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -