;
; Copyright © 2013 Raspberry Pi Foundation
; Copyright © 2013 RISC OS Open Ltd
;
; Permission to use, copy, modify, distribute, and sell this software and its
; documentation for any purpose is hereby granted without fee, provided that
; the above copyright notice appear in all copies and that both that
; copyright notice and this permission notice appear in supporting
; documentation, and that the name of the copyright holders not be used in
; advertising or publicity pertaining to distribution of the software without
; specific, written prior permission. The copyright holders make no
; representations about the suitability of this software for any purpose. It
; is provided "as is" without express or implied warranty.
;
; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
; SOFTWARE.
;
; Build-time switches. Each one is declared here only when the including file
; has not already defined it, so the tests later in this file always find a
; defined logical variable. DebugData and DebugPld are consumed by the Print
; macro (which tests Debug<switch>) and by the printf import below.
[ :LNOT: :DEF: DebugData
GBLL DebugData
]
[ :LNOT: :DEF: DebugPld
GBLL DebugPld
]
[ :LNOT: :DEF: VerboseBuild
GBLL VerboseBuild
]
; Flag bitfield definitions
; These constants are ORed together to form the "flags" value that the macros
; below test with :AND:. Each group occupies its own bit position(s); the
; zero-valued member of a group is the default and exists for readability.
; Bits 0-1: halftone mode
FLAG_NO_HALFTONE * 0 :SHL: 0
FLAG_SCALAR_HALFTONE * 1 :SHL: 0
FLAG_VECTOR_HALFTONE * 2 :SHL: 0
; Bit 2: colour map pointer is loaded by FunctionPrologue when set
FLAG_NO_COLOUR_MAP * 0 :SHL: 2
FLAG_COLOUR_MAP * 1 :SHL: 2
; Bit 3: when set, the destination is also read (see the Write*Word(s) and
; Process* macros, which load from or store behind the destination pointer)
FLAG_DST_WRITEONLY * 0 :SHL: 3
FLAG_DST_READWRITE * 1 :SHL: 3
; Bits 4-5: which function variants spill line_saved_regs to the stack.
; FLAG_SPILL_LINE_VARS is the union of the WIDE and NON_WIDE bits.
FLAG_SPILL_NO_LINE_VARS * 0 :SHL: 4
FLAG_SPILL_LINE_VARS_WIDE * 1 :SHL: 4
FLAG_SPILL_LINE_VARS_NON_WIDE * 2 :SHL: 4
FLAG_SPILL_LINE_VARS * 3 :SHL: 4
; Bit 6: when set, skew is handled at run time from a register rather than
; by assemble-time fixed shift amounts
FLAG_EXPAND_SKEW * 0 :SHL: 6
FLAG_NO_EXPAND_SKEW * 1 :SHL: 6
; Bit 7: sub-word processing model
FLAG_PROCESS_SERIAL * 0 :SHL: 7 ; sub-word data is presented MS-aligned, and results are expected LS-aligned
FLAG_PROCESS_PARALLEL * 1 :SHL: 7 ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
; Bit 8: largest block macro size
FLAG_MAX_128BIT_MACRO * 0 :SHL: 8
FLAG_MAX_256BIT_MACRO * 1 :SHL: 8 ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
; Bit 9: destination preloading (not tested by any macro in this part of the file)
FLAG_PRELOAD_DST * 0 :SHL: 9
FLAG_NO_PRELOAD_DST * 1 :SHL: 9
; Offsets into stack
; args_stack_offset is the distance from sp to the first stacked argument. It
; starts at 9 words because FunctionPrologue pushes r4-r11 and lr, and it is
; adjusted (together with locals_stack_offset) whenever FunctionPrologue or
; FunctionEpilogue pushes or pops the line-saved registers.
GBLA args_stack_offset
args_stack_offset SETA 9*4
GBLA locals_stack_offset
locals_stack_offset SETA 0
; Top-level macro arguments are held in variables for convenience
GBLA src_bpp
GBLA dst_w_bpp
GBLA flags
GBLA prefetch_distance
GBLS leading_pixels_reg
GBLS preload_offset_reg
GBLS line_saved_regs
GBLS init
GBLS newline
GBLS reinitwk
GBLS cleanup
; Derived values
GBLS prefix
GBLA dst_r_bpp
GBLA src_bpp_shift
GBLA dst_bpp_shift
GBLL sub_byte
GBLA num_line_saved_regs
GBLA pix_per_block
; Work registers - variables so they can be reassigned between functions
; (should always be assigned in increasing register number though)
; wkN_num holds the physical register number (or -1 when unassigned); these
; are written by the AssignWk macro.
GBLA wk0_num
GBLA wk1_num
GBLA wk2_num
GBLA wk3_num
GBLA wk4_num
GBLA wk5_num
GBLA wk6_num
GBLA wk7_num
GBLA wk8_num
GBLA wk9_num
GBLA wk10_num
; String versions of the same
; wkN holds the register name ("r<number>") and is what LookupWk returns.
GBLS wk0
GBLS wk1
GBLS wk2
GBLS wk3
GBLS wk4
GBLS wk5
GBLS wk6
GBLS wk7
GBLS wk8
GBLS wk9
GBLS wk10
; printf is only referenced by the Print macro, so it is imported only when
; at least one of the debug switches is enabled
[ DebugData :LOR: DebugPld
IMPORT printf
]
; Assemble-time state for the Print macro: true when the next message starts
; a new output line and should therefore be prefixed with its switch name
GBLL PrintAtStartOfLine
PrintAtStartOfLine SETL {TRUE}
; Debug print helper: Print<cond> switch, fmt, reg0, reg1, reg2
; Expands to nothing unless the logical variable Debug<switch> is true.
; Otherwise it calls printf with the format string fmt and the values that up
; to three named registers held on entry to the macro. The message is prefixed
; with "<switch>: " when PrintAtStartOfLine is true, and PrintAtStartOfLine is
; then updated according to whether fmt ends in \n.
; r0-r12, r14 and the CPSR are restored before execution continues; the CPSR
; is kept in v1 across the call.
; With a condition suffix other than AL the whole sequence is skipped by
; branching on the opposite condition.
; NOTE(review): 15 words are pushed before the call, so sp alignment at the
; call to printf depends on the caller - confirm this is acceptable for the
; target ABI.
MACRO
Print$cond $switch, $fmt, $reg0, $reg1, $reg2
[ Debug$switch
[ "$cond" <> "" :LAND: "$cond" <> "AL"
LCLS opp
opp SETS :REVERSE_CC: "$cond"
B$opp %FT82
]
; Build a register dump on the stack, indexed by register number:
; slots 0-12 = r0-r12, slot 13 = sp on entry (patched in below), slot 14 = r14
PUSH {r12,r14}
PUSH {r0-r12}
ADD ip, sp, #15*4
STR ip, [sp, #13*4]
MRS v1, CPSR
; Fetch the requested values from the dump so that registers already
; overwritten as printf arguments are still reported with their entry values
[ "$reg0" <> ""
LDR a2, [sp, #:RCONST:$reg0 * 4]
]
[ "$reg1" <> ""
LDR a3, [sp, #:RCONST:$reg1 * 4]
]
[ "$reg2" <> ""
LDR a4, [sp, #:RCONST:$reg2 * 4]
]
ADR a1, %FT80
ADR lr, %FT81
B printf
; Format string is stored inline; printf returns to label 81 just past it
80
[ PrintAtStartOfLine
= "$switch: "
]
= "$fmt", 0
PrintAtStartOfLine SETL "$fmt" :RIGHT: 1 = "\n"
ALIGN
81 MSR CPSR_cxsf, v1
POP {r0-r12}
; Skip the slot that held the entry value of sp
ADD sp, sp, #4
POP {r14}
82
]
MEND
; Fallback definitions, assembled only when neither objasm$version nor
; ads$version is defined. Each macro hand-assembles one instruction with DCI:
; the fixed opcode bits are ORed with the register numbers, Rn in bits 16-19,
; Rd in bits 12-15 and Rm in bits 0-3. All encodings use condition field &E,
; so conditional forms of these instructions are not available this way.
[ :LNOT: :DEF: |objasm$version| :LAND: :LNOT: :DEF: |ads$version|
; Assume asasm, which is lacking a number of key opcodes
MACRO
$label SEL $Rd, $Rn, $Rm
$label DCI &E6800FB0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
MEND
MACRO
$label UADD8 $Rd, $Rn, $Rm
$label DCI &E6500F90 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
MEND
MACRO
$label USUB8 $Rd, $Rn, $Rm
$label DCI &E6500FF0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
MEND
MACRO
$label USUB16 $Rd, $Rn, $Rm
$label DCI &E6500F70 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
MEND
; SETEND accepts only the operands LE and BE; anything else is reported as an
; assembly error
MACRO
$label SETEND $endian
IF "$endian" = "LE"
$label DCI &F1010000
ELIF "$endian" = "BE"
$label DCI &F1010200
ELSE
! 1, "Unrecognised SETEND endianness"
ENDIF
MEND
]
; Add a constant, using a minimal number of ARM instructions
; $dst = $src + $const, emitted as one ADD per 8-bit chunk of the constant,
; where each chunk starts at an even bit position so that it is a valid ARM
; rotated immediate. The first ADD reads $src, any further ones accumulate
; into $dst.
; A zero constant yields a single ADD of #0 so that $dst still receives $src.
; Constants with bit 30 or bit 31 set are not supported (the chunk search
; would wrap and emit nothing), so they are rejected with an assembly error.
MACRO
$lab AddL $dst, $src, $const
ASSERT (($const) :SHR: 30) = 0
LCLA tmp
tmp SETA $const
; Make both bits of every even-aligned bit pair set if either one was set
tmp SETA tmp :OR: (((tmp :AND: &55555555) :SHL: 1) + ((tmp :AND: &AAAAAAAA) :SHR: 1))
LCLA lsb
; Lowest set bit, which is now guaranteed to be at an even position
lsb SETA tmp :AND::NOT: (tmp-1)
; Smear the highest set bit downwards to find the bit just above it
tmp SETA tmp :OR: (tmp :SHR: 2)
tmp SETA tmp :OR: (tmp :SHR: 4)
tmp SETA tmp :OR: (tmp :SHR: 8)
tmp SETA tmp :OR: (tmp :SHR: 16)
LCLA msb
msb SETA (tmp+1) :AND::NOT: tmp
LCLS reg
reg SETS "$src"
$lab
[ lsb = 0
; Zero constant: the chunk loop could never advance, so handle it directly
ADD $dst, $src, #0
|
WHILE lsb < msb
ADD $dst, $reg, #($const) :AND: (lsb * &FF)
lsb SETA lsb * 256
reg SETS "$dst"
WEND
]
MEND
; Find log2 of a variable
; Usage: out Log2 in
; Sets the arithmetic variable out to the number of times in can be halved
; (integer division) before reaching 1, i.e. the position of its top set bit.
; An input of 0 gives -1.
MACRO
$out Log2 $in
[ $in = 0
$out SETA -1
|
LCLA tmp
tmp SETA $in
$out SETA 0
WHILE tmp > 1
tmp SETA tmp / 2
$out SETA $out + 1
WEND
]
MEND
; Find max of two numbers
; Usage: out Max a, b
; Sets the arithmetic variable out to whichever of a and b is larger
; (b when they are equal). The arguments are substituted textually.
MACRO
$out Max $a, $b
[ $a <= $b
$out SETA $b
|
$out SETA $a
]
MEND
; Find if an integer is the last in a group of a power-of-2 integers
; Usage: result IsEndOfGroup index, size
; Sets the logical variable result. For size < 2 every index ends a group.
; Otherwise the expression isolates the bits that are set in index but clear
; in index+1 (the run of 1 bits at the bottom of index) and checks that the
; bit worth size/2 is part of that run, which is the case exactly when the
; low log2(size) bits of index are all ones.
MACRO
$result IsEndOfGroup $index, $size
LCLA index
index SETA $index
LCLA size
size SETA $size
[ size < 2
$result SETL {TRUE}
|
$result SETL (index :AND::NOT: (index + 1)) :AND: (size / 2) > 0
]
MEND
; Convert an integer to a decimal string
; Usage: str DecimalStr num
; Sets the string variable str to the decimal digits of num, built from the
; least significant digit upwards by prepending one character per iteration.
; Zero produces "0" rather than the empty string.
MACRO
$str DecimalStr $num
LCLA n
n SETA $num
$str SETS ""
WHILE n <> 0
$str SETS :CHR:(48 + n % 10) :CC: $str
n SETA n / 10
WEND
IF :LEN: $str = 0
$str SETS "0"
ENDIF
MEND
; Convert a wk register index into the name of the physical register
; Usage: str LookupWk index
; index is any arithmetic expression. It is converted to decimal, prefixed
; with "wk" to form the name of one of the global wkN string variables, and
; the current value of that variable (as set by AssignWk) is copied to str.
MACRO
$str LookupWk $index
LCLS wk
wk DecimalStr $index
wk SETS "wk$wk"
$str SETS $wk
MEND
; Assign the wk registers from a list of registers
; Usage: AssignWk list
; list is a comma-separated list of register names. The Nth entry is assigned
; to work register N: wkN_num receives its register number and wkN the string
; "r<number>". All remaining work registers up to wk10 are set to number -1
; and to a name beginning "invalid_register_wk", so that any accidental use
; fails to assemble.
MACRO
AssignWk $list
LCLA wk_num
LCLS wk
LCLS tail
LCLS reg
wk_num SETA 0
; A trailing comma is appended so that every entry is comma-terminated
tail SETS "$list,"
WHILE :LEN: tail > 0
wk DecimalStr wk_num
wk_num SETA wk_num + 1
reg SETS ""
; Move characters from the front of tail to reg until a comma is reached
WHILE tail :LEFT: 1 <> ","
reg SETS reg :CC: (tail :LEFT: 1)
tail SETS tail :RIGHT: (:LEN:tail - 1)
WEND
; Drop the comma itself
tail SETS tail :RIGHT: (:LEN:tail - 1)
wk$wk._num SETA :RCONST: $reg
wk$wk DecimalStr wk$wk._num
wk$wk SETS "r" :CC: wk$wk
WEND
; Ensure the remaining ones aren't used
WHILE wk_num <= 10
wk DecimalStr wk_num
wk_num SETA wk_num + 1
wk$wk._num SETA -1
wk$wk SETS "invalid_register_wk$wk"
WEND
MEND
; See if a given register name is in a comma-separated list of registers
; Usage: out RegIsInList reg, list
; Sets the logical variable out. The comparison is textual: reg followed by a
; comma must match the start of a list entry exactly, so an alias of the same
; physical register spelled differently is not recognised.
MACRO
$out RegIsInList $reg, $list
LCLS tail
tail SETS "$list,"
WHILE :LEN: tail > 0
; Length check first, so that the prefix comparison never asks for more
; characters than remain
[ :LEN: "$reg," <= :LEN: tail
[ "$reg," = tail :LEFT: :LEN: "$reg,"
$out SETL {TRUE}
MEXIT
]
]
; No match at this entry: discard it, up to and including its comma
WHILE tail :LEFT: 1 <> ","
tail SETS tail :RIGHT: (:LEN:tail - 1)
WEND
tail SETS tail :RIGHT: (:LEN:tail - 1)
WEND
$out SETL {FALSE}
MEND
; Count how many registers are in a comma-separated list of registers
; Usage: out CountRegsInList list
; Sets the arithmetic variable out to the number of entries in list, which is
; one more than the number of commas it contains. An empty list counts as
; zero entries.
MACRO
$out CountRegsInList $list
LCLS tail
tail SETS "$list"
[ :LEN: tail = 0
$out SETA 0
|
$out SETA 1
]
; Each comma separates one further entry from those already counted
WHILE :LEN: tail > 0
[ tail :LEFT: 1 = ","
$out SETA $out + 1
]
tail SETS tail :RIGHT: (:LEN:tail -1)
WEND
MEND
; Data read macros
; ReadFirstSubWord: fetch the source data for the leading sub-word pixels of
; a line. Generates code only when 0 < src_bpp < 32.
;   $base       source pointer register, post-incremented by 4 per word loaded
;   $data       wk index (see LookupWk) of the register that receives the data
;   $carry      register that receives the word to be carried into later reads
;   $pixels     register holding the pixel count (only its value modulo the
;               number of pixels per word is used), or the literal "#0"
;   $fixed_skew assemble-time skew in bits, used unless FLAG_NO_EXPAND_SKEW
;   $skew       register holding the skew in bits, used with FLAG_NO_EXPAND_SKEW
;   $tmp        scratch register
; When skewed, the result is (data LSL skew) ORed with (carry LSR 32-skew).
; Unless FLAG_PROCESS_PARALLEL is set, the result is finally rotated right by
; the number of bits occupied by the sub-word pixels.
; NOTE(review): in the skewed variants $tmp is compared with 0 even when
; $pixels is the literal "#0", in which case nothing in this macro has set
; $tmp - confirm that callers do not depend on the outcome in that case.
MACRO
$lab ReadFirstSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp
$lab
[ src_bpp > 0 :LAND: src_bpp < 32
LCLS reg0
reg0 LookupWk $data
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
; Run-time skew held in a register
[ "$pixels" <> "#0"
AND $tmp, $pixels, #32/src_bpp - 1
; A "left" word is needed only when there are more sub-word pixels than the
; skew (converted to pixels) already provides
CMP $tmp, $skew, LSR #src_bpp_shift
PrintHI Data, "ReadFirstSubWord: left@%p", $base
LDRHI $reg0, [$base], #4
PrintHI Data, " %08X\n", $reg0
]
; A "right" word is loaded into the carry register whenever skew is non-zero
CMP $skew, #0
PrintHI Data, "ReadFirstSubWord: right@%p", $base
LDRHI $carry, [$base], #4
PrintHI Data, " %08X\n", $carry
CMP $tmp, #0
BEQ %FT01
RSB $tmp, $skew, #32
MOV $reg0, $reg0, LSL $skew
ORR $reg0, $reg0, $carry, LSR $tmp
Print Data, "ReadFirstSubWord: skew %u -> %08X\n", $skew, $reg0
[ flags :AND: FLAG_PROCESS_PARALLEL = 0 :LAND: "$pixels" <> "#0"
AND $tmp, $pixels, #32/src_bpp - 1
MOV $tmp, $tmp, LSL #src_bpp_shift
MOV $reg0, $reg0, ROR $tmp
]
01
ELIF $fixed_skew == 0
; No skew: at most one word is loaded and no carry word is involved
[ "$pixels" <> "#0"
ANDS $tmp, $pixels, #32/src_bpp - 1
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
BEQ %FT01
Print Data, "ReadFirstSubWord: left@%p", $base
LDR $reg0, [$base], #4
Print Data, " %08X\n", $reg0
MOV $tmp, $tmp, LSL #src_bpp_shift
MOV $reg0, $reg0, ROR $tmp
01
|
PrintNE Data, "ReadFirstSubWord: left@%p", $base
LDRNE $reg0, [$base], #4
PrintNE Data, " %08X\n", $reg0
]
]
ELSE
; Non-zero skew known at assembly time; the carry word is always loaded
[ "$pixels" <> "#0"
AND $tmp, $pixels, #32/src_bpp - 1
CMP $tmp, #$fixed_skew/src_bpp
PrintHI Data, "ReadFirstSubWord: left@%p", $base
LDRHI $reg0, [$base], #4
PrintHI Data, " %08X\n", $reg0
]
Print Data, "ReadFirstSubWord: right@%p", $base
LDR $carry, [$base], #4
Print Data, " %08X\n", $carry
CMP $tmp, #0
BEQ %FT01
MOV $reg0, $reg0, LSL #$fixed_skew
ORR $reg0, $reg0, $carry, LSR #32-$fixed_skew
Print Data, "ReadFirstSubWord: skew $fixed_skew -> %08X\n", $reg0
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
MOV $tmp, $tmp, LSL #src_bpp_shift
MOV $reg0, $reg0, ROR $tmp
]
01
ENDIF
]
MEND
; ReadLastSubWord: fetch the source data for the trailing sub-word pixels of
; a line. Generates code only when 0 < src_bpp < 32.
; Arguments are as for ReadFirstSubWord: $base is post-incremented by 4 per
; word loaded, $data is a wk index, $carry holds the word left over from the
; previous read, $pixels is a register holding the pixel count, $fixed_skew /
; $skew give the skew in bits and $tmp is a scratch register.
; Without skew, a word is loaded only if there are sub-word pixels. With skew,
; the result starts as (carry LSL skew) and a further word is loaded and
; merged in only when the carried bits do not cover all the pixels required.
MACRO
$lab ReadLastSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp
$lab
[ src_bpp > 0 :LAND: src_bpp < 32
LCLS reg0
reg0 LookupWk $data
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
; Run-time skew: choose between the unskewed and skewed paths by branching
CMP $skew, #0
BHI %FT01
TST $pixels, #32/src_bpp - 1
PrintNE Data, "ReadLastSubWord: next@%p", $base
LDRNE $reg0, [$base], #4
PrintNE Data, " %08X\n", $reg0
B %FT02
01
Print Data, "ReadLastSubWord: left %08X\n", $carry
MOV $reg0, $carry, LSL $skew
; Compare (pixels per word - sub-word pixel count) with the skew in pixels;
; another word is fetched only when the former is the lower of the two
AND $tmp, $pixels, #32/src_bpp - 1
RSB $tmp, $tmp, #32/src_bpp
CMP $tmp, $skew, LSR #src_bpp_shift
BHS %FT02
Print Data, "ReadLastSubWord: right@%p", $base
LDR $carry, [$base], #4
Print Data, " %08X\n", $carry
RSB $tmp, $skew, #32
ORR $reg0, $reg0, $carry, LSR $tmp
Print Data, "ReadLastSubWord: skew %u -> %08X\n", $skew, $reg0
02
ELIF $fixed_skew == 0
TST $pixels, #32/src_bpp - 1
PrintNE Data, "ReadLastSubWord: next@%p", $base
LDRNE $reg0, [$base], #4
PrintNE Data, " %08X\n", $reg0
ELSE
; Same as the run-time skewed path, with the shifts fixed at assembly time
Print Data, "ReadLastSubWord: left %08X\n", $carry
MOV $reg0, $carry, LSL #$fixed_skew
AND $tmp, $pixels, #32/src_bpp - 1
RSB $tmp, $tmp, #32/src_bpp
CMP $tmp, #$fixed_skew/src_bpp
BHS %FT02
Print Data, "ReadLastSubWord: right@%p", $base
LDR $carry, [$base], #4
Print Data, " %08X\n", $carry
ORR $reg0, $reg0, $carry, LSR #32-$fixed_skew
Print Data, "ReadLastSubWord: skew $fixed_skew -> %08X\n", $reg0
02
ENDIF
]
MEND
; Read1Word: read one 32-bit word of source data into work register $first
; (a wk index, see LookupWk), applying the source skew.
;   $base       source pointer register, post-incremented by 4
;   $carry      register holding the bits left over from the previous read;
;               updated with the newly loaded word whenever skew is applied
;   $fixed_skew assemble-time skew in bits, used unless FLAG_NO_EXPAND_SKEW
;   $skew       register holding the skew in bits, used with FLAG_NO_EXPAND_SKEW
;   $tmp        scratch register (only used for run-time skew)
; With skew, the result is (carry LSL skew) ORed with (new word LSR 32-skew).
; The run-time zero test is made on the register passed as $skew, the same
; register that supplies the shift amounts.
MACRO
$lab Read1Word $base, $first, $carry, $fixed_skew, $skew, $tmp
LCLS reg0
reg0 LookupWk $first
$lab
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
TEQ $skew, #0
BNE %FT01
; Skew is zero at run time: plain load, carry is not involved
Print Data, "Read1Word: next@%p", $base
LDR $reg0, [$base], #4
Print Data, " %08X\n", $reg0
B %FT02
01
Print Data, "Read1Word: left %08X, right@%p", $carry, $base
MOV $reg0, $carry, LSL $skew
LDR $carry, [$base], #4
Print Data, " %08X", $carry
RSB $tmp, $skew, #32 ; no benefit to precalculating this, will stall anyway from LDR
ORR $reg0, $reg0, $carry, LSR $tmp
Print Data, ", skew %u -> %08X\n", $skew, $reg0
02
ELIF $fixed_skew = 0
Print Data, "Read1Word: next@%p", $base
LDR $reg0, [$base], #4
Print Data, " %08X\n", $reg0
ELSE
Print Data, "Read1Word: left %08X, right@%p", $carry, $base
MOV $reg0, $carry, LSL #$fixed_skew
LDR $carry, [$base], #4
Print Data, " %08X", $carry
ORR $reg0, $reg0, $carry, LSR #32-$fixed_skew
Print Data, ", skew $fixed_skew -> %08X\n", $reg0
ENDIF
MEND
; Read2Words: read two 32-bit words of source data into work registers
; $first and $first+1 (wk indices, see LookupWk), applying the source skew.
;   $base       source pointer register, advanced by 8
;   $carry      register holding the bits left over from the previous read;
;               updated with the last word loaded whenever skew is applied
;   $fixed_skew assemble-time skew in bits, used unless FLAG_NO_EXPAND_SKEW
;   $skew       register holding the skew in bits, used with FLAG_NO_EXPAND_SKEW
;   $tmp        scratch register (only used for run-time skew)
; With skew, each result word is (previous word LSL skew) ORed with
; (following word LSR 32-skew), the chain starting from the carry register.
; The skewed loads use a register list ending in $carry, so $carry must have
; a higher register number than the work registers.
; The run-time zero test is made on the register passed as $skew, the same
; register that supplies the shift amounts.
MACRO
$lab Read2Words $base, $first, $carry, $fixed_skew, $skew, $tmp
LCLS reg0
reg0 LookupWk $first
LCLS reg1
reg1 LookupWk $first+1
$lab
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
TEQ $skew, #0
BNE %FT01
; Skew is zero at run time: plain load, carry is not involved
Print Data, "Read2Words: next@%p", $base
LDMIA $base!, {$reg0, $reg1}
Print Data, " %08X %08X\n", $reg0, $reg1
B %FT02
01
Print Data, "Read2Words: left %08X, right@%p", $carry, $base
MOV $reg0, $carry, LSL $skew
LDMIA $base!, {$reg1, $carry}
Print Data, " %08X %08X", $reg1, $carry
RSB $tmp, $skew, #32 ; no benefit to precalculating this, will stall anyway from LDM
ORR $reg0, $reg0, $reg1, LSR $tmp
MOV $reg1, $reg1, LSL $skew
ORR $reg1, $reg1, $carry, LSR $tmp
Print Data, ", skew %u -> %08X %08X\n", $skew, $reg0, $reg1
02
ELIF $fixed_skew = 0
Print Data, "Read2Words: next@%p", $base
LDMIA $base!, {$reg0, $reg1}
Print Data, " %08X %08X\n", $reg0, $reg1
ELSE
Print Data, "Read2Words: left %08X, right@%p", $carry, $base
MOV $reg0, $carry, LSL #$fixed_skew
LDMIA $base!, {$reg1, $carry}
Print Data, " %08X %08X", $reg1, $carry
ORR $reg0, $reg0, $reg1, LSR #32-$fixed_skew
MOV $reg1, $reg1, LSL #$fixed_skew
ORR $reg1, $reg1, $carry, LSR #32-$fixed_skew
Print Data, ", skew $fixed_skew -> %08X %08X\n", $reg0, $reg1
ENDIF
MEND
; Read4Words: read four 32-bit words of source data into work registers
; $first to $first+3 (wk indices, see LookupWk), applying the source skew.
;   $base       source pointer register, advanced by 16
;   $carry      register holding the bits left over from the previous read;
;               updated with the last word loaded whenever skew is applied
;   $fixed_skew assemble-time skew in bits, used unless FLAG_NO_EXPAND_SKEW
;   $skew       register holding the skew in bits, used with FLAG_NO_EXPAND_SKEW
;   $tmp        scratch register (only used for run-time skew)
; With skew, each result word is (previous word LSL skew) ORed with
; (following word LSR 32-skew), the chain starting from the carry register.
; The skewed loads use register lists ending in $carry, so $carry must have
; a higher register number than the work registers.
; The run-time zero test is made on the register passed as $skew, the same
; register that supplies the shift amounts.
MACRO
$lab Read4Words $base, $first, $carry, $fixed_skew, $skew, $tmp
LCLS reg0
reg0 LookupWk $first
LCLS reg1
reg1 LookupWk $first+1
LCLS reg2
reg2 LookupWk $first+2
LCLS reg3
reg3 LookupWk $first+3
$lab
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
TEQ $skew, #0
BNE %FT01
; Skew is zero at run time: plain load, carry is not involved
Print Data, "Read4Words: next@%p", $base
LDMIA $base!, {$reg0, $reg1, $reg2, $reg3}
Print Data, " %08X %08X", $reg0, $reg1
Print Data, " %08X %08X\n", $reg2, $reg3
B %FT02
01
Print Data, "Read4Words: left %08X, right@%p", $carry, $base
LDMIA $base!, {$reg1, $reg2}
Print Data, " %08X %08X", $reg1, $reg2
; The carry must be consumed before the second load overwrites it
MOV $reg0, $carry, LSL $skew
RSB $tmp, $skew, #32
LDMIA $base!, {$reg3, $carry}
Print Data, " %08X %08X", $reg3, $carry
ORR $reg0, $reg0, $reg1, LSR $tmp
MOV $reg1, $reg1, LSL $skew
ORR $reg1, $reg1, $reg2, LSR $tmp
MOV $reg2, $reg2, LSL $skew
ORR $reg2, $reg2, $reg3, LSR $tmp
MOV $reg3, $reg3, LSL $skew
ORR $reg3, $reg3, $carry, LSR $tmp
Print Data, ", skew %u -> %08X %08X", $skew, $reg0, $reg1
Print Data, " %08X %08X\n", $reg2, $reg3
02
ELIF $fixed_skew = 0
Print Data, "Read4Words: next@%p", $base
LDMIA $base!, {$reg0, $reg1, $reg2, $reg3}
Print Data, " %08X %08X", $reg0, $reg1
Print Data, " %08X %08X\n", $reg2, $reg3
ELSE
Print Data, "Read4Words: left %08X, right@%p", $carry, $base
LDMIA $base!, {$reg1, $reg2}
Print Data, " %08X %08X", $reg1, $reg2
; The carry must be consumed before the second load overwrites it
MOV $reg0, $carry, LSL #$fixed_skew
LDMIA $base!, {$reg3, $carry}
Print Data, " %08X %08X", $reg3, $carry
ORR $reg0, $reg0, $reg1, LSR #32-$fixed_skew
MOV $reg1, $reg1, LSL #$fixed_skew
ORR $reg1, $reg1, $reg2, LSR #32-$fixed_skew
MOV $reg2, $reg2, LSL #$fixed_skew
ORR $reg2, $reg2, $reg3, LSR #32-$fixed_skew
MOV $reg3, $reg3, LSL #$fixed_skew
ORR $reg3, $reg3, $carry, LSR #32-$fixed_skew
Print Data, ", skew $fixed_skew -> %08X %08X", $reg0, $reg1
Print Data, " %08X %08X\n", $reg2, $reg3
ENDIF
MEND
; Read8Words: read eight 32-bit words of source data into work registers
; $first to $first+7 (wk indices, see LookupWk), applying the source skew.
;   $base       source pointer register, advanced by 32
;   $carry      register holding the bits left over from the previous read;
;               updated with the last word loaded whenever skew is applied
;   $fixed_skew assemble-time skew in bits, used unless FLAG_NO_EXPAND_SKEW
;   $skew       register holding the skew in bits, used with FLAG_NO_EXPAND_SKEW
;   $tmp        scratch register (only used for run-time skew)
; With skew, each result word is (previous word LSL skew) ORed with
; (following word LSR 32-skew), the chain starting from the carry register.
; The skewed loads use register lists ending in $carry, so $carry must have
; a higher register number than the work registers.
; Unlike the narrower read macros, this one emits no debug Print output.
; The run-time zero test is made on the register passed as $skew, the same
; register that supplies the shift amounts.
MACRO
$lab Read8Words $base, $first, $carry, $fixed_skew, $skew, $tmp
LCLS reg0
reg0 LookupWk $first
LCLS reg1
reg1 LookupWk $first+1
LCLS reg2
reg2 LookupWk $first+2
LCLS reg3
reg3 LookupWk $first+3
LCLS reg4
reg4 LookupWk $first+4
LCLS reg5
reg5 LookupWk $first+5
LCLS reg6
reg6 LookupWk $first+6
LCLS reg7
reg7 LookupWk $first+7
$lab
IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
TEQ $skew, #0
BNE %FT01
; Skew is zero at run time: plain load, carry is not involved
LDMIA $base!, {$reg0, $reg1, $reg2, $reg3, $reg4, $reg5, $reg6, $reg7}
B %FT02
01 LDMIA $base!, {$reg1, $reg2, $reg3, $reg4}
; The carry must be consumed before the second load overwrites it
MOV $reg0, $carry, LSL $skew
RSB $tmp, $skew, #32
LDMIA $base!, {$reg5, $reg6, $reg7, $carry}
ORR $reg0, $reg0, $reg1, LSR $tmp
MOV $reg1, $reg1, LSL $skew
ORR $reg1, $reg1, $reg2, LSR $tmp
MOV $reg2, $reg2, LSL $skew
ORR $reg2, $reg2, $reg3, LSR $tmp
MOV $reg3, $reg3, LSL $skew
ORR $reg3, $reg3, $reg4, LSR $tmp
MOV $reg4, $reg4, LSL $skew
ORR $reg4, $reg4, $reg5, LSR $tmp
MOV $reg5, $reg5, LSL $skew
ORR $reg5, $reg5, $reg6, LSR $tmp
MOV $reg6, $reg6, LSL $skew
ORR $reg6, $reg6, $reg7, LSR $tmp
MOV $reg7, $reg7, LSL $skew
ORR $reg7, $reg7, $carry, LSR $tmp
02
ELIF $fixed_skew = 0
LDMIA $base!, {$reg0, $reg1, $reg2, $reg3, $reg4, $reg5, $reg6, $reg7}
ELSE
LDMIA $base!, {$reg1, $reg2, $reg3, $reg4}
; The carry must be consumed before the second load overwrites it
MOV $reg0, $carry, LSL #$fixed_skew
LDMIA $base!, {$reg5, $reg6, $reg7, $carry}
ORR $reg0, $reg0, $reg1, LSR #32-$fixed_skew
MOV $reg1, $reg1, LSL #$fixed_skew
ORR $reg1, $reg1, $reg2, LSR #32-$fixed_skew
MOV $reg2, $reg2, LSL #$fixed_skew
ORR $reg2, $reg2, $reg3, LSR #32-$fixed_skew
MOV $reg3, $reg3, LSL #$fixed_skew
ORR $reg3, $reg3, $reg4, LSR #32-$fixed_skew
MOV $reg4, $reg4, LSL #$fixed_skew
ORR $reg4, $reg4, $reg5, LSR #32-$fixed_skew
MOV $reg5, $reg5, LSL #$fixed_skew
ORR $reg5, $reg5, $reg6, LSR #32-$fixed_skew
MOV $reg6, $reg6, LSL #$fixed_skew
ORR $reg6, $reg6, $reg7, LSR #32-$fixed_skew
MOV $reg7, $reg7, LSL #$fixed_skew
ORR $reg7, $reg7, $carry, LSR #32-$fixed_skew
ENDIF
MEND
; Data write macros
; WriteFirstSubWord: store the leading sub-word pixels of a line, which
; occupy the least significant end of the work register.
;   $base   destination pointer register, advanced by 4 in total
;   $data   wk index (see LookupWk) of the register holding the pixels
;   $pixels register holding the pixel count; only the sub-word part is used
;   $tmp1, $tmp2 scratch registers
; Below 8bpp the destination word is read, merged under a mask and written
; back. At 8bpp the pixels are stored with byte and halfword writes, and at
; 16bpp with a single halfword write. Where registers have been corrupted,
; the optional reinitwk macro is invoked with the list of those registers.
MACRO
$lab WriteFirstSubWord $base, $data, $pixels, $tmp1, $tmp2
; It is assumed that there is at least 1 pixel to write
LCLS reg0
reg0 LookupWk $data
Print Data, "WriteFirstSubWord: %08X / %u pixels @%p\n", $reg0, $pixels, $base
$lab ; Pixels should be LS-aligned whether processing was done in parallel or serial
IF dst_w_bpp < 8
; Build a mask with ones above the bits occupied by the new pixels; those
; bits are preserved from memory and cleared in the new data
AND $tmp1, $pixels, #32/dst_w_bpp - 1
MOV $tmp1, $tmp1, LSL #dst_bpp_shift
MOV $tmp2, #-1
MOV $tmp2, $tmp2, LSL $tmp1
LDR $tmp1, [$base]
BIC $reg0, $reg0, $tmp2
AND $tmp1, $tmp1, $tmp2
ORR $tmp1, $tmp1, $reg0
STR $tmp1, [$base], #4
[ "$reinitwk" <> ""
$prefix._$reinitwk "$reg0,$tmp1,$tmp2"
]
ELIF dst_w_bpp = 8
; xxaabbcc -> xx aa bbcc byte write at +2 halfword write at +0
; xxxxaabb -> xxxx aabb halfword write at +0
; xxxxxxaa -> xxxxxx aa byte write at +0
MOVS $tmp1, $pixels, LSL #31 ; C = halfword, N = byte
; HI (C set, Z clear) means both bits were set, i.e. three pixels
BHI %FT03
BCS %FT02
01 STRB $reg0, [$base], #4
B %FT00
02 STRH $reg0, [$base], #4
B %FT00
03 MOV $tmp1, $reg0, LSR #16
STRH $reg0, [$base], #2
STRB $tmp1, [$base], #2
00
[ "$reinitwk" <> ""
$prefix._$reinitwk "$tmp1"
]
ELIF dst_w_bpp = 16
STRH $reg0, [$base], #4
ENDIF
MEND
; WriteLastSubWord: store the trailing sub-word pixels of a line into the
; most significant end of the destination word.
;   $base    destination pointer register, advanced by 4 when data is written
;   $data    wk index (see LookupWk) of the register holding the pixels
;   $pixels  register holding the pixel count; only the sub-word part is used
;   $aligned ms_aligned or ls_aligned, stating which end of the work register
;            the pixels currently occupy
;   $tmp1, $tmp2 scratch registers
; Below 8bpp the destination word is read, merged under a mask and written
; back. At 8bpp byte and halfword writes are used, and at 16bpp a single
; halfword write, which is skipped entirely when the pixel count is even.
; Where registers have been corrupted, the optional reinitwk macro is invoked
; with the list of those registers.
MACRO
$lab WriteLastSubWord $base, $data, $pixels, $aligned, $tmp1, $tmp2
; It is assumed that there is at least 1 pixel to write
LCLS reg0
reg0 LookupWk $data
[ DebugData
; lr is borrowed to hold the sub-word pixel count for the debug message
PUSH {lr}
AND lr, $pixels, #32/dst_w_bpp - 1
Print Data, "WriteLastSubWord: %08X / %u pixels @%p $aligned\n", $reg0, lr, $base
POP {lr}
]
$lab ; If pixels were processed in parallel, they'll still be MS-aligned, else they'll be LS-aligned
IF dst_w_bpp < 8
AND $tmp1, $pixels, #32/dst_w_bpp - 1
MOV $tmp1, $tmp1, LSL #dst_bpp_shift
[ "$aligned" = "ls_aligned"
; Move the pixels up to the most significant end of the register
RSB $tmp2, $tmp1, #32
MOV $reg0, $reg0, LSL $tmp2
]
; Build a mask with ones below the bits occupied by the new pixels; those
; bits are preserved from memory
MOV $tmp2, #-1
MOV $tmp2, $tmp2, LSR $tmp1
LDR $tmp1, [$base]
[ "$aligned" = "ms_aligned"
BIC $reg0, $reg0, $tmp2
]
AND $tmp1, $tmp1, $tmp2
ORR $tmp1, $tmp1, $reg0
STR $tmp1, [$base], #4
[ "$reinitwk" <> ""
$prefix._$reinitwk "$reg0,$tmp1,$tmp2"
]
ELIF dst_w_bpp = 8
; MS aligned case:
; aaxxxxxx -> aa xxxxxx byte write at +3
; aabbxxxx -> aabb xxxx halfword write at +2
; aabbccxx -> aabb cc xx halfword write at +2 byte write at +1
; LS aligned case:
; xxaabbcc -> aabb cc xx byte write at +1 halfword write at +2
; xxxxaabb -> aabb xxxx halfword write at +2
; xxxxxxaa -> aa xxxxxx byte write at +3
MOVS $tmp1, $pixels, LSL #31 ; C = halfword, N = byte
; HI (C set, Z clear) means both bits were set, i.e. three pixels
BHI %FT03
BCS %FT02
[ "$aligned" = "ms_aligned"
01 MOV $reg0, $reg0, LSR #24
STRB $reg0, [$base, #3]
B %FT04
02 MOV $reg0, $reg0, LSR #16
STRH $reg0, [$base, #2]
B %FT04
03 MOV $tmp1, $reg0, LSR #8
MOV $reg0, $reg0, LSR #16
STRB $tmp1, [$base, #1]
STRH $reg0, [$base, #2]
04
[ "$reinitwk" <> ""
$prefix._$reinitwk "$reg0,$tmp1"
]
|
01 STRB $reg0, [$base, #3]
B %FT04
02 STRH $reg0, [$base, #2]
B %FT04
03 MOV $tmp1, $reg0, LSR #8
STRB $reg0, [$base, #1]
STRH $tmp1, [$base, #2]
04
[ "$reinitwk" <> ""
$prefix._$reinitwk "$tmp1"
]
]
ADD $base, $base, #4
ELIF dst_w_bpp = 16
TST $pixels, #1
BEQ %FT01
[ "$aligned" = "ms_aligned"
MOV $reg0, $reg0, LSR #16
]
STRH $reg0, [$base, #2]
ADD $base, $base, #4
[ "$aligned" = "ms_aligned" :LAND: "$reinitwk" <> ""
$prefix._$reinitwk "$reg0"
]
01
ENDIF
MEND
; Write1Word: store work register $first (a wk index, see LookupWk) as one
; word of destination data.
; For a write-only destination the word goes to [$base] and $base advances
; by 4. For a read-write destination $base is left alone and the word goes
; to the 4 bytes just below it.
MACRO
$lab Write1Word $base, $first
LCLS reg0
reg0 LookupWk $first
Print Data, "Write1Word: %08X @%p\n", $reg0, $base
$lab
IF (flags :AND: FLAG_DST_READWRITE) = 0
STR $reg0, [$base], #4
ELSE
STR $reg0, [$base, #-4] ; base is assumed previously updated during read
ENDIF
MEND
; Write2Words: store two work registers as two words of destination data.
;   $base   destination pointer register
;   $first  wk index (see LookupWk) of the first register
;   $second optional wk index of the second register; defaults to $first+1
; For a write-only destination the words go to [$base] onwards and $base
; advances by 8. For a read-write destination $base is left alone and the
; words go to the 8 bytes just below it.
MACRO
$lab Write2Words $base, $first, $second
LCLS reg0
reg0 LookupWk $first
LCLS reg1
[ "$second" <> ""
reg1 LookupWk $second
|
reg1 LookupWk $first+1
]
Print Data, "Write2Words: %08X %08X @%p\n", $reg0, $reg1, $base
$lab
IF (flags :AND: FLAG_DST_READWRITE) > 0
STMDB $base, {$reg0, $reg1} ; base is assumed previously updated during read
ELSE
STMIA $base!, {$reg0, $reg1}
ENDIF
MEND
; Write4Words: store four work registers as four words of destination data.
;   $base   destination pointer register
;   $first  wk index (see LookupWk) of the first register
;   $second, $third, $fourth optional wk indices of the other registers;
;           they default to $first+1, $first+2 and $first+3 respectively
; For a write-only destination the words go to [$base] onwards and $base
; advances by 16. For a read-write destination $base is left alone and the
; words go to the 16 bytes just below it.
; The registers are stored with a single store-multiple, which writes
; registers in ascending register-number order whatever order is listed.
MACRO
$lab Write4Words $base, $first, $second, $third, $fourth
LCLS reg0
reg0 LookupWk $first
LCLS reg1
[ "$second" <> ""
reg1 LookupWk $second
|
reg1 LookupWk $first+1
]
LCLS reg2
[ "$third" <> ""
reg2 LookupWk $third
|
reg2 LookupWk $first+2
]
LCLS reg3
[ "$fourth" <> ""
reg3 LookupWk $fourth
|
reg3 LookupWk $first+3
]
Print Data, "Write4Words: %08X %08X", $reg0, $reg1
Print Data, " %08X %08X @%p\n", $reg2, $reg3, $base
$lab
IF (flags :AND: FLAG_DST_READWRITE) > 0
STMDB $base, {$reg0, $reg1, $reg2, $reg3} ; base is assumed previously updated during read
ELSE
STMIA $base!, {$reg0, $reg1, $reg2, $reg3}
ENDIF
MEND
; Block or sub-block macros
; ProcessLeading31Bits: handle the leading pixels that do not fill a whole
; destination word.
;   $pixels     register holding the number of leading pixels
;   $fixed_skew assemble-time skew in bits, passed on to the read and
;               per-operation macros
; The source sub-word is always fetched with ReadFirstSubWord. The rest is
; generated only for destinations below 32bpp and is skipped at run time when
; there are no sub-word pixels. With serial processing, one
; <prefix>_<n>bits macro is invoked for each power-of-2 chunk size from one
; pixel up to 16 bits, each guarded by a test of the matching bit of the
; pixel count, and the result is written from work register 1. With parallel
; processing a single <prefix>_32bits invocation is used and the result is
; written from work register 0.
MACRO
$lab ProcessLeading31Bits $pixels, $fixed_skew
$lab ReadFirstSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
[ dst_w_bpp < 32
ANDS scratch, $pixels, #32/dst_w_bpp - 1
BEQ %FT02
[ flags :AND: FLAG_DST_READWRITE > 0
LDR $wk1, [dst]
]
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
[ flags :AND: FLAG_DST_READWRITE > 0
; Rotate the existing destination word right by the number of bits that the
; leading pixels occupy
MOV scratch, scratch, LSL #dst_bpp_shift
MOV $wk1, $wk1, ROR scratch
]
LCLA pow2
LCLS pow2str
pow2 SETA dst_w_bpp
WHILE pow2 <= 16
pow2str DecimalStr pow2
TST $pixels, #pow2/dst_w_bpp
BEQ %FT01
$prefix._$pow2str.bits $wk0, $wk1, $fixed_skew
01
pow2 SETA pow2 * 2
WEND
WriteFirstSubWord dst, 1, $pixels, scratch, $wk2
|
$prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
WriteFirstSubWord dst, 0, $pixels, scratch, $wk2
]
02
]
MEND
; ProcessLeading127Bits: handle up to 127 bits' worth of leading destination
; pixels. The sub-word part is delegated to ProcessLeading31Bits; then one
; 32-bit and one 64-bit chunk are each processed if the corresponding bit of
; the pixel count in $pixels is set. $fixed_skew is passed through to the
; per-operation macros.
MACRO
$lab ProcessLeading127Bits $pixels, $fixed_skew
$lab ProcessLeading31Bits $pixels, $fixed_skew
TST $pixels, #32/dst_w_bpp
BEQ %FT01
$prefix._32bits $wk0, memory, $fixed_skew ; and store result to memory
01 TST $pixels, #64/dst_w_bpp
BEQ %FT01
$prefix._64bits $wk0, $fixed_skew
01
MEND
; ProcessTrailing127Bits: handle up to 127 bits' worth of trailing
; destination pixels, in chunks of decreasing size (64 bits, 32 bits, then
; sub-word). Nothing is executed at run time if the pixel count in $pixels
; has none of those bits set. $fixed_skew is passed through to the read and
; per-operation macros.
; ReadLastSubWord is invoked at exactly one point, selected at assembly time
; by comparing 16 * dst_w_bpp / src_bpp with the chunk sizes: either up
; front, or immediately before the chunk of that size, and in the latter
; cases only if pixels of that size or smaller remain.
MACRO
$lab ProcessTrailing127Bits $pixels, $fixed_skew
$lab
TST $pixels, #128/dst_w_bpp - 1
BEQ %FT02
[ src_bpp > 0
[ 16 * dst_w_bpp / src_bpp >= 64
ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
]
]
TST $pixels, #64/dst_w_bpp
BEQ %FT01
$prefix._64bits $wk0, $fixed_skew
01
[ src_bpp > 0
[ 16 * dst_w_bpp / src_bpp = 32
TST $pixels, #64/dst_w_bpp - 1
BEQ %FT01
ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
01
]
]
TST $pixels, #32/dst_w_bpp
BEQ %FT01
$prefix._32bits $wk0, memory, $fixed_skew ; and store result to memory
01
[ dst_w_bpp < 32
TST $pixels, #32/dst_w_bpp - 1
BEQ %FT02
[ flags :AND: FLAG_DST_READWRITE > 0
LDR $wk1, [dst]
]
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
; Serial processing: one per-operation macro per power-of-2 chunk size, from
; 16 bits down to one pixel, then write the LS-aligned result from wk 1
LCLA pow2
LCLS pow2str
pow2 SETA 16
WHILE pow2 >= dst_w_bpp
pow2str DecimalStr pow2
[ src_bpp > 0
[ 16 * dst_w_bpp / src_bpp = pow2
TST $pixels, #2*pow2/dst_w_bpp - 1
BEQ %FT01
ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
01
]
]
TST $pixels, #pow2/dst_w_bpp
BEQ %FT01
$prefix._$pow2str.bits $wk0, $wk1, $fixed_skew
01
pow2 SETA pow2 / 2
WEND
WriteLastSubWord dst, 1, $pixels, ls_aligned, scratch, $wk0
|
; Parallel processing: one 32-bit operation, result is MS-aligned in wk 0
ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
$prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
WriteLastSubWord dst, 0, $pixels, ms_aligned, scratch, $wk1
]
]
02
MEND
; FunctionPrologue: common entry sequence for a generated function.
;   $spill_type      suffix (WIDE or NON_WIDE) selecting which
;                    FLAG_SPILL_LINE_VARS_<type> bit applies to this function
;   $predecrement_x  constant subtracted from x before the first line, or 0
; Pushes r4-r11 and lr (the 9 words accounted for by the initial value of
; args_stack_offset), pre-decrements the row count and leaves via label 99
; (defined by FunctionEpilogue) if there were no rows, then loads whichever
; stacked arguments the source depth and flags require and invokes the
; optional init macro. The width is copied to orig_w unless x is among the
; spilled registers. If spilling applies, line_saved_regs are pushed and the
; assemble-time stack offsets are adjusted to match.
MACRO
$lab FunctionPrologue $spill_type, $predecrement_x
$lab PUSH {r4-r11, lr}
SUBS y, y, #1
BLO %FA99
[ src_bpp > 0
LDR src, [sp, #args_stack_offset]
LDR stride_s, [sp, #args_stack_offset+4]
]
[ flags :AND: (FLAG_SCALAR_HALFTONE :OR: FLAG_VECTOR_HALFTONE) > 0
LDR ht, [sp, #args_stack_offset+8]
]
[ flags :AND: FLAG_VECTOR_HALFTONE > 0
LDR ht_info, [sp, #args_stack_offset+12]
]
[ flags :AND: FLAG_COLOUR_MAP > 0
LDR map, [sp, #args_stack_offset+16]
]
[ sub_byte
LDR bitptrs, [sp, #args_stack_offset+20]
]
[ "$init" <> ""
$prefix._$init
]
[ $predecrement_x > 0
SUB x, x, #$predecrement_x
]
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type = 0 :LOR: :LNOT: SpilledX
MOV orig_w, x
]
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type > 0
PUSH {$line_saved_regs}
args_stack_offset SETA args_stack_offset + num_line_saved_regs * 4
locals_stack_offset SETA locals_stack_offset + num_line_saved_regs * 4
]
MEND
; FunctionEpilogue: common end-of-line and exit sequence for a generated
; function. $spill_type has the same meaning as for FunctionPrologue.
; At the end of each line it restores the line-saved registers from the stack
; (without popping them) or restores x from orig_w, decrements the row count,
; writes the new row count back to its stack slot if y is one of the spilled
; registers, steps dst (and src, if there is a source) on by the stride in
; words, and branches back to label 51 while rows remain. After the last row
; it discards the spilled registers, undoes the assemble-time stack offset
; adjustment, invokes the optional cleanup macro and returns by popping
; r4-r11 and pc. Label 99 is the early-exit target used by FunctionPrologue.
MACRO
$lab FunctionEpilogue $spill_type
$lab
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type > 0
LDMIA sp, {$line_saved_regs}
]
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type = 0 :LOR: :LNOT: SpilledX
MOV x, orig_w
]
; The flags set here are consumed by the BHS below; the instructions in
; between do not alter them
SUBS y, y, #1
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type > 0 :LAND: SpilledY
[ SpilledX
STR y, [sp, #4]
|
STR y, [sp]
]
]
ADD dst, dst, stride_d, LSL #2
[ src_bpp > 0
ADD src, src, stride_s, LSL #2
]
BHS %BA51
[ flags :AND: FLAG_SPILL_LINE_VARS_$spill_type > 0
ADD sp, sp, #num_line_saved_regs * 4
args_stack_offset SETA args_stack_offset - num_line_saved_regs * 4
locals_stack_offset SETA locals_stack_offset - num_line_saved_regs * 4
]
[ "$cleanup" <> ""
$prefix._$cleanup
]
99
POP {r4-r11, pc}
MEND
; PreloadLeadingStep1: issue the initial run of cache preloads for a line.
;   $bpp  pixel depth of the data being preloaded; no code if it is 0
;   $ptr  register that receives $base rounded down to a 32-byte boundary
;   $base register holding the address at which the data starts
; One PLD is emitted for each 32-byte step from offset 0 up to and including
; prefetch_distance * 32. With DebugPld set, each address is also printed.
MACRO
PreloadLeadingStep1 $bpp, $ptr, $base
[ $bpp > 0
BIC $ptr, $base, #31
LCLA offset
offset SETA 0
WHILE offset <= prefetch_distance * 32
[ DebugPld
; Temporarily advance the pointer so the message shows the real address
ADD $ptr, $ptr, #offset
Print Pld, "%p (leading, step 1)\n", $ptr
SUB $ptr, $ptr, #offset
]
PLD [$ptr, #offset]
offset SETA offset + 32
WEND
]
MEND
; PreloadLeadingStep2: issue any further preloads needed to cover the data
; consumed while processing the leading pixels.
;   $bpp, $bpp_shift  pixel depth of the data and its log2; no code if 0
;   $ptr              32-byte-aligned pointer set up by PreloadLeadingStep1;
;                     advanced by 32 per extra preload in the source case
;   $base             register holding the data address; the destination
;                     variant is selected by comparing this with dst
;   $leading_pixels   register holding the number of leading pixels
;   $tmp              scratch register
; Every preload here is issued at (prefetch_distance+1) * 32 bytes beyond
; $ptr. For the destination at most one is issued; for the source a loop
; issues one per cacheline until $ptr reaches the cacheline at which the
; inner loop will start.
MACRO
PreloadLeadingStep2 $bpp, $bpp_shift, $ptr, $base, $leading_pixels, $tmp
[ $bpp > 0
[ $base = dst
; The test can be simplified further when preloading the destination -
; if the destination is already 16-byte aligned, or if it's in the bottom
; half of a 32-byte cacheline, then the leading pixels steps won't cause
; the destination pointer to reach the next cacheline, and even if they do,
; only 1 extra preload is required
AND $tmp, $base, #&1C
CMP $tmp, #&10
; Only when the word offset is exactly 16 does the pixel offset within the
; word decide the outcome
[ $bpp < 8
TSTEQ bitptrs, #&1F
|
TSTEQ stride_d, #&C0000000
]
BLS %FT01
[ DebugPld
ADD $ptr, $ptr, #(prefetch_distance+1) * 32
Print Pld, "%p (leading, step 2)\n", $ptr
SUB $ptr, $ptr, #(prefetch_distance+1) * 32
]
PLD [$ptr, #(prefetch_distance+1) * 32]
01
|
; Start from the pixel offset within the first source word, then add the
; base address converted to pixel units and the leading pixel count
[ $bpp < 8
MOV $tmp, bitptrs, LSR #27
|
MOV $tmp, stride_s, LSR #30
]
[ $bpp <= 8
ADD $tmp, $tmp, $base, LSL #3-$bpp_shift
|
ADD $tmp, $tmp, $base, LSR #$bpp_shift-3
]
ADD $tmp, $tmp, $leading_pixels
BIC $tmp, $tmp, #256/$bpp - 1 ; now $tmp is the source cacheline corresponding to start of inner loop, in units of pixels
[ $bpp <= 8
TEQ $tmp, $ptr, LSL #3-$bpp_shift
|
TEQ $tmp, $ptr, LSR #$bpp_shift-3
]
BEQ %FT02
01
[ DebugPld
ADD $ptr, $ptr, #(prefetch_distance+1) * 32
Print Pld, "%p (leading, step 2)\n", $ptr
SUB $ptr, $ptr, #(prefetch_distance+1) * 32
]
PLD [$ptr, #(prefetch_distance+1) * 32]
ADD $ptr, $ptr, #32
[ $bpp <= 8
TEQ $tmp, $ptr, LSL #3-$bpp_shift
|
TEQ $tmp, $ptr, LSR #$bpp_shift-3
]
BNE %BT01
02
]
]
MEND
; PreloadMiddle: issue one source preload from within the inner loop.
; If a preload offset register was supplied to the generator, the preload
; address is src plus that register. Otherwise src is rounded down to a
; 32-byte boundary in scratch and the preload is issued prefetch_distance
; cachelines beyond it. With DebugPld set, the address is also printed.
MACRO
PreloadMiddle
[ "$preload_offset_reg" <> ""
[ DebugPld
; Temporarily advance src so the message shows the real address
ADD src, src, $preload_offset_reg
Print Pld, "%p (middle)\n", src
SUB src, src, $preload_offset_reg
]
PLD [src, $preload_offset_reg]
|
BIC scratch, src, #31
[ DebugPld
ADD scratch, scratch, #prefetch_distance * 32
Print Pld, "%p (middle)\n", scratch
SUB scratch, scratch, #prefetch_distance * 32
]
PLD [scratch, #prefetch_distance * 32]
]
MEND
; PreloadTrailing: issue any remaining preloads needed to reach the last
; cacheline that will be read on this line.
;   $bpp, $bpp_shift  pixel depth of the data and its log2; no code if 0
;   $base             register holding the current data address
;   $trailing_pixels  register holding the number of pixels still to process
;   $fixed_skew       assemble-time skew in bits
; Both scratch and the leading-pixels register are corrupted. The skew
; adjustment is applied only when $base is not dst and the depth is below 32.
MACRO
PreloadTrailing $bpp, $bpp_shift, $base, $trailing_pixels, $fixed_skew
; We have just preloaded
; (src &~ 31) + prefetch_distance * 32
; The last pixel to be read will be
; src*8/src_bpp - skew/src_bpp + x
; Use leading_pixels_reg as a temporary (must avoid wk0, may be holding over source data when dst_bpp > 4 * src_bpp)
[ $bpp > 0
BIC $leading_pixels_reg, $base, #31
ADD $leading_pixels_reg, $leading_pixels_reg, #prefetch_distance * 32
[ $bpp <= 8
ADD scratch, $trailing_pixels, $base, LSL #3-$bpp_shift
|
ADD scratch, $trailing_pixels, $base, LSR #$bpp_shift-3
]
[ $base <> dst :LAND: $bpp < 32
IF flags :AND: FLAG_NO_EXPAND_SKEW > 0
SUB scratch, scratch, skew, LSR #$bpp_shift
ELIF $fixed_skew > 0
SUB scratch, scratch, #$fixed_skew/$bpp
ENDIF
]
BIC scratch, scratch, #256/$bpp - 1 ; last cacheline to read from (inclusive), in pixel units
01 ; There may be 0 or more extra cachelines to prefetch
[ $bpp <= 8
TEQ scratch, $leading_pixels_reg, LSL #3-$bpp_shift
|
TEQ scratch, $leading_pixels_reg, LSR #$bpp_shift-3
]
BEQ %FT02
ADD $leading_pixels_reg, $leading_pixels_reg, #32
[ DebugPld
Print Pld, "%p (trailing)\n", $leading_pixels_reg
]
PLD [$leading_pixels_reg]
B %BT01
02
]
MEND
; PreloadLine: preload every cacheline touched by one whole line of data.
;   $base             register holding the data address; the source variant
;                     is selected by comparing this with src
;   $bpp, $bpp_shift  pixel depth of the data and its log2; no code if 0
;   $tmp1             scratch; steps through the cacheline addresses
;   $tmp2             scratch; ends up as the address of the last cacheline
; The last address is derived from the index of the final pixel on the line:
; the pixel offset within the first word, plus x, minus 1, converted to a
; byte offset from $base and rounded down to a 32-byte boundary.
MACRO
PreloadLine $base, $bpp, $bpp_shift, $tmp1, $tmp2
[ $bpp > 0
BIC $tmp1, $base, #31
[ $base = src
[ $bpp < 8
ADD $tmp2, x, bitptrs, LSR #27
|
ADD $tmp2, x, stride_s, LSR #30
]
|
[ $bpp < 8
AND $tmp2, bitptrs, #&1F
ADD $tmp2, x, $tmp2
|
ADD $tmp2, x, stride_d, LSR #30
]
]
SUB $tmp2, $tmp2, #1
[ DebugPld
Print Pld, "%p (line)\n", $tmp1
]
PLD [$tmp1]
[ $bpp < 8
ADD $tmp2, $base, $tmp2, LSR #3-$bpp_shift
|
ADD $tmp2, $base, $tmp2, LSL #$bpp_shift-3
]
BIC $tmp2, $tmp2, #31
CMP $tmp1, $tmp2
BEQ %FT02
; The comparison is made before the preload; the Print macro restores the
; flags it finds, so the result is still valid at the BNE
01 ADD $tmp1, $tmp1, #32
CMP $tmp1, $tmp2
[ DebugPld
Print Pld, "%p (line)\n", $tmp1
]
PLD [$tmp1]
BNE %BT01
02
]
MEND
; AssignTmpReg: pick the next work register that is free for temporary use.
; Usage: AssignTmpReg reg
; Work registers are tried in order starting from index next_available_reg,
; which is advanced past each one considered. The first candidate passing
; all of the following tests is stored in the string variable reg:
;  - it is not the register named by tmp_leading_pixels
;  - it is not bitptrs, unless neither source nor destination is below 8bpp
;  - it is not stride_s, unless the source is below 8bpp
;  - it is not stride_d, unless the destination is below 8bpp
; The loop has no other exit, so running out of work registers shows up as
; an assembly error from the lookup.
MACRO
AssignTmpReg $reg
LCLS candidate
WHILE {TRUE}
candidate LookupWk next_available_reg
next_available_reg SETA next_available_reg + 1
[ $candidate <> $tmp_leading_pixels \
:LAND: (((src_bpp = 0 :LOR: src_bpp >= 8) :LAND: dst_w_bpp >= 8) :LOR: $candidate <> bitptrs) \
:LAND: ((src_bpp > 0 :LAND: src_bpp < 8) :LOR: $candidate <> stride_s) \
:LAND: (dst_w_bpp < 8 :LOR: $candidate <> stride_d)
$reg SETS "$candidate"
MEXIT
]
WEND
MEND
; CalculateLeadingPixels: work out how many pixels must be processed before
; the destination pointer reaches a 16-byte boundary.
; The position of the first pixel within its 16-byte block is formed from
; bits 2-3 of the destination address, scaled to pixel units, combined with
; the pixel offset within the first word (the top two bits of stride_d for
; 8 and 16bpp, the bottom five bits of bitptrs below 8bpp, none at 32bpp).
; If that position is non-zero it is subtracted from the number of pixels in
; 16 bytes; if it is zero the result stays zero. The result is left in the
; register named by tmp_leading_pixels, and scratch is corrupted.
MACRO
CalculateLeadingPixels
IF dst_w_bpp = 32
MOV scratch, dst, LSR #dst_bpp_shift-3
ANDS $tmp_leading_pixels, scratch, #&60 :SHR: dst_bpp_shift
ELIF dst_w_bpp = 16
MOV scratch, dst, LSR #dst_bpp_shift-3
AND scratch, scratch, #&60 :SHR: dst_bpp_shift
ORRS $tmp_leading_pixels, scratch, stride_d, LSR #30
ELIF dst_w_bpp = 8
AND scratch, dst, #&60 :SHR: dst_bpp_shift
ORRS $tmp_leading_pixels, scratch, stride_d, LSR #30
ELSE ; dst_w_bpp < 8
MOV scratch, dst, LSL #3-dst_bpp_shift
AND scratch, scratch, #&60 :SHR: dst_bpp_shift
AND $tmp_leading_pixels, bitptrs, #&1F
ORRS $tmp_leading_pixels, $tmp_leading_pixels, scratch
ENDIF
; Z is set by the instruction above when the destination is already aligned
RSBNE $tmp_leading_pixels, $tmp_leading_pixels, #128/dst_w_bpp
Print Data, "Leading pixels = %u\n", $tmp_leading_pixels
MEND
; CalculateSkew: compute the initial skew, in pixels, into the skew register.
; No code is generated when there is no source (src_bpp = 0).
; The skew is the leading pixel count plus the pixel offset within the first
; source word (the top five bits of bitptrs below 8bpp, otherwise the top two
; bits of stride_s). When the destination is more than four times as deep as
; the source, the leading pixel count is first increased as described below;
; scratch is corrupted in that case.
MACRO
CalculateSkew
[ src_bpp > 0
[ dst_w_bpp > 4 * src_bpp
; When the destination is much wider than the source, gift a number of
; destination-cachelines-worth of pixels to the skew, to simplify the
; decision of which write operation we need to load the next word before
SUB scratch, x, $tmp_leading_pixels
AND scratch, scratch, #32/src_bpp - 128/dst_w_bpp
ADD $tmp_leading_pixels, $tmp_leading_pixels, scratch
]
[ src_bpp < 8
ADD skew, $tmp_leading_pixels, bitptrs, LSR #27
|
ADD skew, $tmp_leading_pixels, stride_s, LSR #30
]
Print Data, "Skew = %i pixels\n", skew
]
MEND
; DispatchSkew: select the code variant that matches the current skew.
;   $label                    stem of the labels of the per-skew variants;
;                             the target name is the stem followed by the
;                             skew in bits as 8 hex digits
;   $finalise_leading_pixels  if non-empty, the leading pixel count is moved
;                             from its temporary register to its final one
;                             (when the two differ)
; Sets the assemble-time variables fixed_skew (always 0) and last_skew.
; With expanded skew, branches are generated for each non-zero skew and a
; skew of zero falls through; only 8, 16 and 32bpp sources are supported.
; With FLAG_NO_EXPAND_SKEW nothing is dispatched: skew is instead converted
; from pixels to bits and reduced modulo 32.
MACRO
DispatchSkew $label, $finalise_leading_pixels
fixed_skew SETA 0
[ src_bpp = 0
last_skew SETA 1
|
[ flags :AND: FLAG_NO_EXPAND_SKEW = 0
last_skew SETA 32
IF src_bpp = 32
[ "$finalise_leading_pixels" <> "" :LAND: $tmp_leading_pixels <> $leading_pixels_reg
MOV $leading_pixels_reg, $tmp_leading_pixels
]
; Do nothing, just drop into the skew = fixed 0 case
ELIF src_bpp = 16
; The MOV is placed between the test and the branch; it leaves the flags
; unchanged
TST skew, #1
[ "$finalise_leading_pixels" <> "" :LAND: $tmp_leading_pixels <> $leading_pixels_reg
MOV $leading_pixels_reg, $tmp_leading_pixels
]
BNE $label.00000010
ELIF src_bpp = 8
; Shift the bottom two bits of skew into C and N so that all three non-zero
; cases can be told apart by condition codes alone
MOVS scratch, skew, LSL #31
[ "$finalise_leading_pixels" <> "" :LAND: $tmp_leading_pixels <> $leading_pixels_reg
MOV $leading_pixels_reg, $tmp_leading_pixels
]
BHI $label.00000018
BCS $label.00000010
BMI $label.00000008
ELSE
! 1, "Skew branch table not yet implemented for source < 8bpp"
ENDIF
|
last_skew SETA src_bpp
[ "$finalise_leading_pixels" <> "" :LAND: $tmp_leading_pixels <> $leading_pixels_reg
ASSERT $tmp_leading_pixels <> skew
MOV $leading_pixels_reg, $tmp_leading_pixels
]
[ src_bpp_shift > 0
MOV skew, skew, LSL #src_bpp_shift
]
AND skew, skew, #31
]
]
MEND
; Generated function entry conditions are:
; r0 = width (pixels)
; r1 = height (rows)
; r2 -> word containing top-left pixel of destination
; r3 bits 0-29 = destination stride (words), bits 30-31 = pixel index into first word (iff dest is >= 8bpp)
; [sp] = NULL, or -> word containing top-left pixel of source
; [sp,#4] = 0, or bits 0-29 = source stride (words), bits 30-31 = pixel index into first word (iff src is >= 8bpp)
; [sp,#8] = 0, or = halftone scalar, or -> after end of halftone vector
; [sp,#12] = 0, or bits 0-14 = -(vector length), bits 15-16 = 0, bits 17-31 = -(words remaining before wrap)
; [sp,#16] = NULL, or -> colour lookup table
; [sp,#20] bits 0-4 = pixel offset within first dest word (iff dest is < 8bpp)
; bits 27-31 = pixel offset within first source word (iff src is < 8bpp)
; [sp,#24...] any additional arguments: rule-dependent
;
; These map fairly naturally onto registers as follows:
; (r0-r3 are the register arguments above; r4-r9 are named in the same
; order as the six stack words above - presumably loaded from the stack by
; the function prologue, which is not part of this section)
x RN 0 ; pixels to go on current line
y RN 1 ; lines to go
dst RN 2 ; destination pointer
stride_d RN 3 ; destination stride, plus pixel index in bits 30-31
src RN 4 ; source pointer (corresponds to [sp])
stride_s RN 5 ; source stride, plus pixel index in bits 30-31 (corresponds to [sp,#4])
ht RN 6 ; halftone scalar or pointer (corresponds to [sp,#8])
ht_info RN 7 ; halftone vector bookkeeping (corresponds to [sp,#12])
map RN 8 ; colour lookup table pointer (corresponds to [sp,#16])
bitptrs RN 9 ; sub-byte pixel offsets: dest in bits 0-4, source in bits 27-31 (corresponds to [sp,#20])
skew RN 10 ; for when it is passed to fast path in a register
orig_w RN 11 ; for restoring width (in pixels) at the start of each line - only used if x isn't in the list of line-saved registers
scratch RN 12 ; general-purpose temporary
carry RN 14 ; for holding bits left over after skewing previous load - must be higher number than all work registers
; Main macro to generate a fast path function
;
; One invocation emits up to four variants of the same operation; which one
; is called for a given line width is decided by the caller (not visible
; here), except for the wide/medium choice which is made at the wide entry:
;   armSimd<prefix>_wide   - preloads pipelined ahead of the data by
;                            prefetch_distance; branches to the medium
;                            variant if the line is too short. When neither
;                            source nor destination is read this label simply
;                            falls into the medium code.
;   armSimd<prefix>_medium - processes 128-bit destination blocks, preloading
;                            each line in one go (only exported when source
;                            or destination is read)
;   armSimd<prefix>_narrow - aligns only to destination words
;   armSimd<prefix>_tiny   - whole line lies within one destination word
;                            (only generated when dst_w_bpp <= 8)
; where <prefix> is op, src_bpp, "_", dst_w_bpp and qualifier joined together.
;
; Parameters:
;   op, qualifier        - text used to form the prefix
;   src_bpp              - source bits per pixel, or 0 if no source is read
;   dst_w_bpp            - destination bits per pixel
;   flags                - combination of the FLAG_ values
;   prefetch_distance    - how far ahead to preload, in units of 32 bytes
;   work_regs            - register list handed to AssignWk
;   line_saved_regs      - registers preserved across lines
;   leading_pixels_reg   - register that holds the leading pixel count
;   preload_offset_reg   - if non-empty, register set to the source preload
;                          offset in the wide variant
;   init, newline, reinitwk, cleanup - names of optional per-operation hooks;
;                          only newline is referenced directly in this macro
;
; The operation itself is supplied by macros named <prefix>_128bits_head and
; <prefix>_128bits_tail (or the 256bits pair when FLAG_MAX_256BIT_MACRO is
; set), <prefix>_32bits (tiny variant, parallel processing) and
; <prefix>_<dst_w_bpp>bits (tiny variant, serial processing).
MACRO
$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
src_bpp SETA $src_bpp
dst_w_bpp SETA $dst_w_bpp
flags SETA $flags
prefetch_distance SETA $prefetch_distance
line_saved_regs SETS "$line_saved_regs"
leading_pixels_reg SETS "$leading_pixels_reg"
preload_offset_reg SETS "$preload_offset_reg"
init SETS "$init"
newline SETS "$newline"
reinitwk SETS "$reinitwk"
cleanup SETS "$cleanup"
prefix SETS "$op.$src_bpp._$dst_w_bpp.$qualifier"
src_bpp_shift Log2 src_bpp
dst_bpp_shift Log2 dst_w_bpp
; dst_r_bpp is the depth at which the destination is read: 0 if write-only
[ flags :AND: FLAG_DST_READWRITE = 0
dst_r_bpp SETA 0
|
dst_r_bpp SETA dst_w_bpp
]
sub_byte SETL (src_bpp > 0 :LAND: src_bpp < 8) :LOR: dst_w_bpp < 8
num_line_saved_regs CountRegsInList "$line_saved_regs"
LCLL SpilledX
SpilledX RegIsInList x, "$line_saved_regs"
LCLL SpilledY
SpilledY RegIsInList y, "$line_saved_regs"
AssignWk "$work_regs"
LCLA fixed_skew
LCLA last_skew
LCLA dst_prefetch_offset
LCLA subblock
LCLS label
LCLL do_preload
; Number of pixels per block is calculated such that in each block, there is
; * at least 1 16-byte write to destination
; * at least 1 32-byte preload of source (if source is used)
; * at least 1 32-byte preload of destination (if destination is read)
pix_per_block SETA 16*8/dst_w_bpp
[ src_bpp > 0
pix_per_block Max pix_per_block, 32*8/src_bpp
]
[ dst_r_bpp > 0
pix_per_block Max pix_per_block, 32*8/dst_r_bpp
]
[ VerboseBuild
! 0, "$prefix"
! 0, "pixels per block $pix_per_block"
! 0, "writes per block " :CC::STR:(pix_per_block*dst_w_bpp/8/16)
[ src_bpp > 0
! 0, "src preloads per block " :CC::STR:(pix_per_block*src_bpp/8/32)
]
[ dst_r_bpp > 0
! 0, "dst preloads per block " :CC::STR:(pix_per_block*dst_r_bpp/8/32)
]
]
LCLA next_available_reg
next_available_reg SETA 0
LCLS tmp_leading_pixels
; The leading pixel count is calculated while bitptrs, stride_s, stride_d
; and skew may still be needed, so if the requested register is one of
; those, a temporary register is used until DispatchSkew copies it across
[ (((src_bpp = 0 :LOR: src_bpp >= 8) :LAND: dst_w_bpp >= 8) :LOR: $leading_pixels_reg <> bitptrs) \
:LAND: ((src_bpp > 0 :LAND: src_bpp < 8) :LOR: $leading_pixels_reg <> stride_s) \
:LAND: (dst_w_bpp < 8 :LOR: $leading_pixels_reg <> stride_d) \
:LAND: $leading_pixels_reg <> skew
; No clash
tmp_leading_pixels SETS "$leading_pixels_reg"
[ VerboseBuild
! 0, "tmp_leading_pixels ":CC:tmp_leading_pixels:CC:" (= leading_pixels_reg)"
]
|
; Clash - need to hold leading_pixels temporarily in another register
tmp_leading_pixels SETS "pc" ; ensure no match with self
AssignTmpReg tmp_leading_pixels
[ VerboseBuild
! 0, "tmp_leading_pixels ":CC:tmp_leading_pixels
]
]
LCLS preload_src
[ src_bpp > 0
AssignTmpReg preload_src
[ VerboseBuild
! 0, "preload_src ":CC:preload_src
]
]
LCLS preload_dst
[ dst_r_bpp > 0 :LAND: flags :AND: FLAG_NO_PRELOAD_DST = 0
AssignTmpReg preload_dst
[ VerboseBuild
! 0, "preload_dst ":CC:preload_dst
]
]
; ---------------------------------------------------------------------
; Wide variant
; ---------------------------------------------------------------------
EXPORT armSimd$prefix._wide
armSimd$prefix._wide
[ src_bpp > 0 :LOR: dst_r_bpp > 0
; Check whether this is actually a medium-width operation
; (decision made here rather in C due to availability of
; variables like prefetch_distance)
[ (prefetch_distance+3)*pix_per_block > 256
; Only slightly less likely to choose wide case, and uses valid immediate constant
CMP x, #(prefetch_distance+3)*pix_per_block
|
CMP x, #(prefetch_distance+3)*pix_per_block - 1
]
BLO armSimd$prefix._medium
FunctionPrologue WIDE, (prefetch_distance+2)*pix_per_block
51
PreloadLeadingStep1 $src_bpp, $preload_src, src
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
PreloadLeadingStep1 $dst_r_bpp, $preload_dst, dst
]
CalculateLeadingPixels
PreloadLeadingStep2 $src_bpp, $src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
PreloadLeadingStep2 $dst_r_bpp, $dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch
]
CalculateSkew
[ "$newline" <> ""
$prefix._$newline
]
DispatchSkew $prefix._wide_fork, finalise_leading_pixels
; One copy of the remainder of the wide variant per fixed skew value
WHILE fixed_skew < last_skew
label SETS "$prefix._wide_fork" :CC: :STR: fixed_skew
$label
ProcessLeading127Bits $leading_pixels_reg, &$fixed_skew
[ dst_w_bpp > 4 * src_bpp
AND $leading_pixels_reg, $leading_pixels_reg, #127/dst_w_bpp
]
; Deduct the leading pixels from the count remaining on this line
[ $leading_pixels_reg = x
LDR scratch, [sp]
SUB x, scratch, x
|
SUB x, x, $leading_pixels_reg
]
[ "$preload_offset_reg" <> ""
AND $preload_offset_reg, src, #&1C
RSB $preload_offset_reg, $preload_offset_reg, #prefetch_distance * 32
]
; When the destination is preloaded, two copies of the inner loop are
; generated (destination preload offset -16 and 0), selected by bit 4 of
; the destination address
[ dst_r_bpp > 0 :LAND: flags :AND: FLAG_NO_PRELOAD_DST = 0
dst_prefetch_offset SETA -16
TST dst, #16
BNE %FT54
|
dst_prefetch_offset SETA 0
]
52
WHILE dst_prefetch_offset <= 0
subblock SETA 0
WHILE subblock < pix_per_block*dst_w_bpp/128
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
ASSERT flags :AND: FLAG_MAX_256BIT_MACRO = 0
AddL scratch, x, (prefetch_distance+2)*pix_per_block + subblock*128/dst_w_bpp
TST scratch, #32/src_bpp - 128/dst_w_bpp
BNE %FT53
Read1Word src, 0, carry, &$fixed_skew, skew, scratch
53
]
[ flags :AND: FLAG_MAX_256BIT_MACRO > 0
$prefix._256bits_head $wk0, &$fixed_skew, intra_preloads
|
$prefix._128bits_head $wk0, &$fixed_skew, intra_preloads
]
[ src_bpp > 0
[ flags :AND: FLAG_MAX_256BIT_MACRO > 0
; prefetch distance = 256/bpp, block distance = 256/dst_w_bpp
do_preload IsEndOfGroup subblock, 256/256*dst_w_bpp/src_bpp
|
; prefetch distance = 256/bpp, block distance = 128/dst_w_bpp
do_preload IsEndOfGroup subblock, 256/128*dst_w_bpp/src_bpp
]
|
do_preload SETL {FALSE}
]
[ do_preload
PreloadMiddle
]
[ subblock :AND: 1 = 0 :LAND: dst_r_bpp > 0 :LAND: flags :AND: FLAG_NO_PRELOAD_DST = 0
; Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
; destination prefetches are 32-byte aligned. It's also the easiest channel to offset
; preloads for, to achieve staggered prefetches for multiple channels, because there are
; always two STMs per prefetch, so there is always an opposite STM on which to put the
; preload. Note, no need to BIC the base register here
[ DebugPld
ADD dst, dst, #prefetch_distance * 32 + dst_prefetch_offset
Print Pld, "%p (middle)\n", dst
SUB dst, dst, #prefetch_distance * 32 + dst_prefetch_offset
]
PLD [dst, #prefetch_distance * 32 + dst_prefetch_offset]
]
[ flags :AND: FLAG_MAX_256BIT_MACRO > 0
$prefix._256bits_tail $wk0
subblock SETA subblock + 2
|
$prefix._128bits_tail $wk0
subblock SETA subblock + 1
]
WEND
SUBS x, x, #pix_per_block
BHS %BT52
[ dst_prefetch_offset < 0
B %FT55
54
]
dst_prefetch_offset SETA dst_prefetch_offset + 16
WEND
55
; NOTE(review): the first branch below cannot be selected while this code is
; inside the enclosing conditional that requires src_bpp > 0 or dst_r_bpp > 0
[ src_bpp = 0 :LAND: dst_r_bpp = 0
ADD x, x, #(prefetch_distance + 2) * pix_per_block - 128/dst_w_bpp
|
; Just before the final (prefetch_distance+1) blocks, deal with final preloads
[ (prefetch_distance + 2) * pix_per_block > 256
ADD x, x, #(prefetch_distance + 2) * pix_per_block
SUB x, x, #1
|
ADD x, x, #(prefetch_distance + 2) * pix_per_block - 1
]
PreloadTrailing $src_bpp, $src_bpp_shift, src, x, &$fixed_skew
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
PreloadTrailing $dst_r_bpp, $dst_bpp_shift, dst, x
]
SUB x, x, #128/dst_w_bpp - 1
]
; The remainder of this is the same as the medium case
56
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
MOV scratch, #32/src_bpp - 128/dst_w_bpp
BICS scratch, scratch, x
BNE %FT57
Read1Word src, 0, carry, &$fixed_skew, skew, scratch
57
]
$prefix._128bits_head $wk0, &$fixed_skew
$prefix._128bits_tail $wk0
SUBS x, x, #128/dst_w_bpp
BHS %BT56
58
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
; This is the only case where ProcessTrailing127Bits cares about bits of 128/dst_w_bpp or higher
ADD x, x, #128/dst_w_bpp
]
ProcessTrailing127Bits x, &$fixed_skew
[ src_bpp = 0
; No source (this section is still assembled when the destination is
; read): DispatchSkew set last_skew to 1, so step by 1. Stepping by
; src_bpp, i.e. 0, would leave this WHILE loop assembling forever.
fixed_skew SETA fixed_skew + 1
|
; All but the last copy must skip over the copies that follow
[ fixed_skew < last_skew - src_bpp
B %FT59
]
fixed_skew SETA fixed_skew + src_bpp
]
WEND
59
FunctionEpilogue WIDE
LTORG
EXPORT armSimd$prefix._medium
armSimd$prefix._medium
]
; ---------------------------------------------------------------------
; Medium variant
; ---------------------------------------------------------------------
FunctionPrologue NON_WIDE, 0
51
PreloadLine src, src_bpp, src_bpp_shift, scratch, carry
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
PreloadLine dst, dst_r_bpp, dst_bpp_shift, scratch, carry
]
CalculateLeadingPixels
CalculateSkew
[ "$newline" <> ""
$prefix._$newline
]
DispatchSkew $prefix._medium_fork, finalise_leading_pixels
WHILE fixed_skew < last_skew
label SETS "$prefix._medium_fork" :CC: :STR: fixed_skew
$label
; Here we know we have:
; 1) possible group of pixels right-aligned up to first destination block boundary
; 2) 0 or more complete destination blocks
; 3) possible group of pixels left-aligned up to last destination block boundary
ProcessLeading127Bits $leading_pixels_reg, &$fixed_skew
[ dst_w_bpp > 4 * src_bpp
AND $leading_pixels_reg, $leading_pixels_reg, #127/dst_w_bpp
]
[ $leading_pixels_reg = x
LDR scratch, [sp]
SUB x, scratch, x
|
SUB x, x, $leading_pixels_reg
]
SUBS x, x, #128/dst_w_bpp
BLO %FT58
56
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
MOV scratch, #32/src_bpp - 128/dst_w_bpp
BICS scratch, scratch, x
BNE %FT57
Read1Word src, 0, carry, &$fixed_skew, skew, scratch
57
]
$prefix._128bits_head $wk0, &$fixed_skew
$prefix._128bits_tail $wk0
SUBS x, x, #128/dst_w_bpp
BHS %BT56
58
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
; This is the only case where ProcessTrailing127Bits cares about bits of 128/dst_w_bpp or higher
ADD x, x, #128/dst_w_bpp
]
ProcessTrailing127Bits x, &$fixed_skew
[ src_bpp = 0
fixed_skew SETA fixed_skew + 1
|
[ fixed_skew < last_skew - src_bpp
B %FT59
]
fixed_skew SETA fixed_skew + src_bpp
]
WEND
59
FunctionEpilogue NON_WIDE
LTORG
; ---------------------------------------------------------------------
; Narrow variant
; ---------------------------------------------------------------------
EXPORT armSimd$prefix._narrow
armSimd$prefix._narrow FunctionPrologue NON_WIDE, 0
[ src_bpp > 0 :LAND: src_bpp < 32
; Because we're only aiming for 1-word alignment at the destination,
; we can at least have a constant skew for every scanline
[ dst_w_bpp < 8
ANDS skew, bitptrs, #&1F
|
MOVS skew, stride_d, LSR #30
]
[ src_bpp < 8
RSB skew, skew, bitptrs, LSR #27
|
RSB skew, skew, stride_s, LSR #30
]
[ src_bpp < dst_w_bpp
ADDNE skew, skew, #32/dst_w_bpp
]
]
Print Data, "Skew = %i pixels\n", skew
DispatchSkew $prefix._narrow_fork
; Unlike the wide and medium variants, each copy here is a complete line
; loop with its own epilogue
WHILE fixed_skew < last_skew
label SETS "$prefix._narrow_fork" :CC: :STR: fixed_skew
$label
51
PreloadLine src, src_bpp, src_bpp_shift, scratch, carry
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
PreloadLine dst, dst_r_bpp, dst_bpp_shift, scratch, carry
]
[ "$newline" <> ""
$prefix._$newline
]
; Here we know we have:
; 1) possible group of pixels right-aligned up to first destination word boundary
; 2) 0 or more complete destination words
; 3) possible group of pixels left-aligned up to last destination word boundary
[ dst_w_bpp < 32
[ dst_w_bpp < 8
ANDS $leading_pixels_reg, bitptrs, #&1F
|
MOVS $leading_pixels_reg, stride_d, LSR #30
]
RSBNE $leading_pixels_reg, $leading_pixels_reg, #32/dst_w_bpp
Print Data, "Leading pixels = %u\n", $leading_pixels_reg
ProcessLeading31Bits $leading_pixels_reg, &$fixed_skew
|
ProcessLeading31Bits #0, &$fixed_skew
]
[ dst_w_bpp < 32
[ $leading_pixels_reg = x
LDR scratch, [sp]
SUB x, scratch, x
|
SUB x, x, $leading_pixels_reg
]
]
ProcessTrailing127Bits x, &$fixed_skew
FunctionEpilogue NON_WIDE
[ fixed_skew < last_skew - src_bpp :LAND: flags :AND: FLAG_SPILL_LINE_VARS_NON_WIDE > 0
args_stack_offset SETA args_stack_offset + num_line_saved_regs * 4
locals_stack_offset SETA locals_stack_offset + num_line_saved_regs * 4
]
[ src_bpp = 0
fixed_skew SETA fixed_skew + 1
|
fixed_skew SETA fixed_skew + src_bpp
]
WEND
LTORG
; ---------------------------------------------------------------------
; Tiny variant
; ---------------------------------------------------------------------
[ dst_w_bpp <= 8
EXPORT armSimd$prefix._tiny
armSimd$prefix._tiny FunctionPrologue NON_WIDE, 0
51
PreloadLine src, src_bpp, src_bpp_shift, scratch, carry
BIC scratch, dst, #31 ; loading dest is unconditional below
[ DebugPld
Print Pld, "%p (tiny dst)\n", dst
]
PLD [scratch] ; we know we're only working within one word, and therefore one cacheline
LCLS reg0
reg0 LookupWk 0
LCLS reg1
reg1 LookupWk 1
[ flags :AND: FLAG_PROCESS_PARALLEL > 0
ASSERT src_bpp = dst_w_bpp :LOR: src_bpp = 0
ASSERT $leading_pixels_reg <> $reg0
ASSERT $leading_pixels_reg <> $reg1
ASSERT $leading_pixels_reg <> skew
ASSERT $leading_pixels_reg <> carry
; Here we use the "leading pixels" register that is guaranteed
; to persist beyond the pixel processing to hold the bitmask of
; which bits of the destination word are preserved or updated.
; carry = 32 - (number of bits in the line)
[ dst_w_bpp = 1
RSB carry, x, #32
|
MOV carry, #32
SUB carry, carry, x, LSL #dst_bpp_shift
]
MOV $leading_pixels_reg, #-1
MOV $leading_pixels_reg, $leading_pixels_reg, LSL carry
; skew = pixel index of the first destination pixel within its word
[ dst_w_bpp < 8
AND skew, bitptrs, #&1F
|
MOV skew, stride_d, LSR #30
]
[ src_bpp > 0
LDR $reg0, [src], #4
]
MOV carry, skew, LSL #dst_bpp_shift
MOV $leading_pixels_reg, $leading_pixels_reg, LSR carry
Print Data, "Mask %08X\n", $leading_pixels_reg
; carry = pixel index of the first source pixel within its word
[ src_bpp > 0
[ dst_w_bpp < 8
ASSERT $reg0 <> bitptrs
MOV carry, bitptrs, LSR #27
|
ASSERT $reg0 <> stride_s
MOV carry, stride_s, LSR #30
]
]
LDR $reg1, [dst], #4
[ src_bpp > 0
SUB skew, carry, skew
RSB carry, carry, #32/dst_w_bpp
[ dst_w_bpp > 1
MOV skew, skew, LSL #dst_bpp_shift
]
AND skew, skew, #31
; A second source word is only loaded if the line extends beyond the first
CMP x, carry
LDRHI carry, [src], #4
MOVLS carry, $reg0
Print Data, "First source word %08X\n", $reg0
PrintHI Data, "Second source word %08X\n", carry
MOV $reg0, $reg0, LSL skew
Print Data, "Skew %u -> ", skew
RSB skew, skew, #32
ORR $reg0, $reg0, carry, LSR skew
Print Data, "%08X\n", $reg0
Print Data, "Dest word %08X\n", $reg1
]
[ "$newline" <> ""
$prefix._$newline
]
$prefix._32bits $wk0, $wk1, 0
Print Data, "After processing -> %08X\n", $wk0
; Merge the processed bits into the original destination word under the mask
AND $wk0, $wk0, $leading_pixels_reg
BIC $wk1, $wk1, $leading_pixels_reg
ORR $wk1, $wk0, $wk1
Print Data, "Masked to %08X\n", $wk1
STR $wk1, [dst, #-4]
|
; There are actually 3 state variables we need for iterating
; along such short lines one pixel at a time, but the process
; macro is still at liberty to use nearly all registers. So we
; squeeze them into the "leading pixels" register thus:
; bits 27-31: source bits until word reload (detect with C flag)
; bits 5-9: number of pixels to go
; bits 0-4: number of bits by which to rotate dest right at end
; If the source is 32bpp, the callee macro will do the load for us
LCLS dst_sz
dst_sz DecimalStr dst_w_bpp
[ dst_w_bpp < 8
AND carry, bitptrs, #&1F
|
MOV carry, stride_d, LSR #30
]
[ dst_bpp_shift > 0
MOV carry, carry, LSL #dst_bpp_shift
]
ADD scratch, carry, x, LSL #dst_bpp_shift
[ src_bpp > 0 :LAND: src_bpp < 32
ORR skew, scratch, x, LSL #5
RSB scratch, carry, #32
[ src_bpp < 8
MOV carry, bitptrs, LSR #27
|
MOV carry, stride_s, LSR #30
]
LDR $reg0, [src], #4
[ src_bpp_shift > 0
MOV carry, carry, LSL #src_bpp_shift
]
RSB carry, carry, #32
Print Data, "First source word %08X, ROR %u to MS", $reg0, carry
MOV $reg0, $reg0, ROR carry ; first src pixel to process now MS aligned
Print Data, " -> %08X\n", $reg0
SUB carry, carry, #src_bpp
ORR $leading_pixels_reg, skew, carry, LSL #27
|
ORR $leading_pixels_reg, scratch, x, LSL #5
RSB scratch, carry, #32
]
LDR $reg1, [dst], #4
Print Data, "Original destination word %08X, ROR %u to MS", $reg1, scratch
MOV $reg1, $reg1, ROR scratch ; first dst pixel to process now MS aligned
Print Data, " -> %08X\n", $reg1
[ DebugData
PUSH {src}
Print Data, "State word %08X = ", $leading_pixels_reg
MOV src, $leading_pixels_reg, LSR #27
Print Data, "%u, ", src
MOV src, $leading_pixels_reg, LSR #5
AND src, src, #&1F
Print Data, "%u, ", src
AND src, $leading_pixels_reg, #&1F
Print Data, "%u\n", src
POP {src}
]
[ "$newline" <> ""
$prefix._$newline
]
; Per-pixel loop
52
$prefix._$dst_sz.bits $wk0, $wk1, 0 ; this rotates source and dest registers left for us
[ src_bpp > 0 :LAND: src_bpp < 32
Print Data, "Source now %08X\n", $wk0
]
Print Data, "Destination now %08X\n", $wk1
; Decrement the pixel count held in bits 5-9 of the state word
SUB $leading_pixels_reg, $leading_pixels_reg, #1 :SHL: 5
TST $leading_pixels_reg, #31 :SHL: 5
[ src_bpp > 0 :LAND: src_bpp < 32
BEQ %FT53
SUBS $leading_pixels_reg, $leading_pixels_reg, #src_bpp :SHL: 27
Print Data, "State word now %08X\n", $leading_pixels_reg
LDRCC $reg0, [src], #4 ; carry is NOT(borrow) on ARM
PrintCC Data, "Load next source word: %08X\n", $reg0
B %BT52
|
BNE %BT52
]
53 AND scratch, $leading_pixels_reg, #&1F
MOV $reg1, $reg1, ROR scratch
Print Data, "Destination ROR %u -> %08X\n", scratch, $reg1
STR $reg1, [dst, #-4]
]
FunctionEpilogue NON_WIDE
LTORG
]
MEND
END
|