@;
@; Desktop plotters
@; ----------------
@;
@; Displays a rectangle of the supplied sprite data, scaling it up or down
@; independently along the x and y axes. Magnification factors are fixed
@; point fractions with `frac` fractional bits.
@;   output_size = input_size / magnification
@; Functions are provided for colour depths of 8, 12, 15, 16 and 24 bpp.
@; Addresses need not be word aligned for 8bpp, 12bpp, 15bpp and 16bpp.
@; Negative magnifications are not supported.
@;
@; inputs, r0 = &blk
@;   blk+0  -> r0 = x magnification
@;   blk+4  -> r1 = y magnification
@;   blk+8  -> r2 = dest rectangle x offset in pixels
@;   blk+12 -> r3 = dest rectangle y offset
@;   blk+16 -> r4 = dest rectangle width in pixels (x loop count)
@;   blk+20 -> r5 = dest rectangle height (y loop count)
@;   blk+24 -> r6 = dest rectangle top left address
@;   blk+28 -> r7 = screen width in bytes
@;   blk+32 -> r8 = sprite top left address
@;   blk+36 -> r9 = sprite width in bytes
@;
@; Dest offsets are relative to the display window.
@; Source format is expected to be 24bpp 8:8:8 rgb (4 bytes per pixel).
@; r5 holds both loop counts: the y count in its low `split` bits and the
@; x count in the bits above them.


@; Fixed point layout shared by every plotter below. With frac = 16 a
@; magnification of 1<<16 steps exactly one source pixel per dest pixel.
.set frac,  16 @; fractional bits in x/y magnification
@; r5 carries both loop counts; the low `split` bits hold the y count and
@; the bits above hold the x count.
.set split, 16 @; number of bits of r5 used for y count

    .text
    .align 2

@; Word holding the address of rgb2gcol (defined outside this file), read
@; pc-relative by plot8bpp and indexed with a 12-bit colour value.
gcol: .word rgb2gcol @; palette for 256 colour modes

@;-----------------------------------------------------------

@; destination depth = 8bpp (uses palette)
@;
@; Scales a rectangle of 32-bit source pixels into an 8bpp destination.
@; Every source pixel is reduced to a 12-bit index
@;   index bits 3:0 = source bits 23:20
@;   index bits 7:4 = source bits 15:12
@;   index bits 11:8 = source bits 7:4
@; and translated through the byte table whose address is held at gcol.
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0  x magnification            r8  sprite top left address
@;   r1  scratch                    r9  sprite width in bytes
@;   r2  Xoffset (fixed point)      r10 scratch (palette index)
@;   r3  Yoffset (fixed point)      r11 scratch (source pixel / byte)
@;   r4  dest width in pixels       r12 dest write pointer
@;   r5  packed x/y counts          r14 source line address
@;   r6  dest line start
@;   r7  4-pixel word being assembled
@; Stack frame
@;   [sp]     initial Xoffset, reloaded at the start of each row
@;   [sp, #4] y magnification (r1 is needed as scratch)
@;   [sp, #8] screen width in bytes (r7 is needed as scratch)
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot8bpp:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #12
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit8
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
    str     r7, [sp, #8]
yloop8:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop8
@; store up to 3 single pixels, either to reach word alignment or to
@; finish a row that has fewer than 4 pixels left
xloop8_1:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 8bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    ldr     r1, gcol                @; table base (r1 was just used as scratch)
    ldrb    r11, [r1, r10]

    strb    r11, [r12], #1          @; display pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend8
    tst     r12, #3                 @; check dest alignment
    bne     xloop8_1
xloop8:
    cmn     r5, #3<<split           @; check for a width less than 4
    bge     xloop8_1
@; store a word of 4 pixels, first pixel in the least significant byte
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 8bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    ldr     r1, gcol
    ldrb    r7, [r1, r10]           @; pixel 0 -> bits 7:0

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 8bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    ldr     r1, gcol
    ldrb    r11, [r1, r10]
    orr     r7, r7, r11, lsl #8     @; pixel 1 -> bits 15:8

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 8bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    ldr     r1, gcol
    ldrb    r11, [r1, r10]
    orr     r7, r7, r11, lsl #16    @; pixel 2 -> bits 23:16

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 8bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    ldr     r1, gcol
    ldrb    r11, [r1, r10]
    orr     r7, r7, r11, lsl #24    @; pixel 3 -> bits 31:24

    str     r7, [r12], #4           @; display 4 pixels
    adds    r5, r5, #4<<split       @; four pixels done
    ble     xloop8                  @; x count still negative: more to plot
xend8:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    ldr     r7, [sp, #8]
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop8
exit8:
    add     sp, sp, #12
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 12bpp 4:4:4 bgr
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 3:0  = source bits 23:20
@;   dest bits 7:4  = source bits 15:12
@;   dest bits 11:8 = source bits 7:4
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot12bpp_bgr:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit12bgr
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop12bgr:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop12bgr
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop12_1bgr:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend12bgr
xloop12bgr:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop12_1bgr
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp
    and     r1, r11, #0x00f00000
    mov     r10, r1, lsr #20
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #4

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x00f00000
    orr     r10, r10, r1, lsr #4
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsl #8
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #20
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop12bgr              @; x count still negative: more to plot
xend12bgr:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop12bgr
exit12bgr:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 12bpp 4:4:4 rgb
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 3:0  = source bits 7:4
@;   dest bits 7:4  = source bits 15:12
@;   dest bits 11:8 = source bits 23:20
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot12bpp_rgb:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit12rgb
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop12rgb:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop12rgb
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop12_1rgb:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp
    and     r1, r11, #0x000000f0
    mov     r10, r1, lsr #4
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x00f00000
    orr     r10, r10, r1, lsr #12
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend12rgb
xloop12rgb:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop12_1rgb
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp
    and     r1, r11, #0x000000f0
    mov     r10, r1, lsr #4
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsr #8
    and     r1, r11, #0x00f00000
    orr     r10, r10, r1, lsr #12

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 12bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x000000f0
    orr     r10, r10, r1, lsl #12
    and     r1, r11, #0x0000f000
    orr     r10, r10, r1, lsl #8
    and     r1, r11, #0x00f00000
    orr     r10, r10, r1, lsl #4
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop12rgb              @; x count still negative: more to plot
xend12rgb:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop12rgb
exit12rgb:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 15bpp 5:5:5 bgr
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 4:0   = source bits 23:19
@;   dest bits 9:5   = source bits 15:11
@;   dest bits 14:10 = source bits 7:3
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot15bpp_bgr:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit15bgr
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop15bgr:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop15bgr
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop15_1bgr:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp
    and     r1, r11, #0x00f80000
    mov     r10, r1, lsr #19
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsr #6
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #7
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend15bgr
xloop15bgr:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop15_1bgr
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp
    and     r1, r11, #0x00f80000
    mov     r10, r1, lsr #19
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsr #6
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #7

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #3
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsl #10
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #23
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop15bgr              @; x count still negative: more to plot
xend15bgr:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop15bgr
exit15bgr:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 15bpp 5:5:5 rgb
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 4:0   = source bits 7:3
@;   dest bits 9:5   = source bits 15:11
@;   dest bits 14:10 = source bits 23:19
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot15bpp_rgb:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit15rgb
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop15rgb:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop15rgb
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop15_1rgb:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp
    and     r1, r11, #0x000000f8
    mov     r10, r1, lsr #3
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsr #6
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #9
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend15rgb
xloop15rgb:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop15_1rgb
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp
    and     r1, r11, #0x000000f8
    mov     r10, r1, lsr #3
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsr #6
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #9

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 15bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #13
    and     r1, r11, #0x0000f800
    orr     r10, r10, r1, lsl #10
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsl #7
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop15rgb              @; x count still negative: more to plot
xend15rgb:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop15rgb
exit15rgb:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 16bpp 5:6:5 bgr
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 4:0   = source bits 23:19
@;   dest bits 10:5  = source bits 15:10
@;   dest bits 15:11 = source bits 7:3
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot16bpp_bgr:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit16bgr
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop16bgr:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop16bgr
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop16_1bgr:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp
    and     r1, r11, #0x00f80000
    mov     r10, r1, lsr #19
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsr #5
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #8
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend16bgr
xloop16bgr:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop16_1bgr
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp
    and     r1, r11, #0x00f80000
    mov     r10, r1, lsr #19
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsr #5
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #8

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #3
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsl #11
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #24
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop16bgr              @; x count still negative: more to plot
xend16bgr:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop16bgr
exit16bgr:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 16bpp 5:6:5 rgb
@;
@; Scales a rectangle of 32-bit source pixels into 16-bit destination
@; pixels laid out as
@;   dest bits 4:0   = source bits 7:3
@;   dest bits 10:5  = source bits 15:10
@;   dest bits 15:11 = source bits 23:19
@; Pixels are written two at a time as a word when the destination is word
@; aligned, otherwise singly as two byte stores (low byte first).
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0 x magnification, r1 scratch, r2 Xoffset, r3 Yoffset, r4 dest width,
@;   r5 packed x/y counts, r6 dest line start, r7 screen width in bytes,
@;   r8 sprite address, r9 sprite width in bytes, r10 pixel(s) being built,
@;   r11 source pixel, r12 dest write pointer, r14 source line address
@; Stack frame: [sp] initial Xoffset, [sp, #4] y magnification
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot16bpp_rgb:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit16rgb
    bic     r6, r6, #1              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    str     r2, [sp]
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp, #4]
yloop16rgb:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    ldr     r2, [sp]                @; initial Xoffset
    mov     r14, r3, asr #frac      @; (int)Yoffset
    mla     r14, r9, r14, r8        @; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 @; dest line start
    tst     r12, #3                 @; check dest alignment
    beq     xloop16rgb
@; store a single pixel (leading unaligned pixel or trailing odd pixel)
xloop16_1rgb:
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp
    and     r1, r11, #0x000000f8
    mov     r10, r1, lsr #3
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsr #5
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #8
    strb    r10, [r12], #1          @; display left half of pixel
    mov     r10, r10, lsr #8
    strb    r10, [r12], #1          @; display right half of pixel
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend16rgb
xloop16rgb:
    cmn     r5, #1<<split           @; check for a width less than 2
    bge     xloop16_1rgb
@; store a word of 2 pixels, first pixel in the low halfword
    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp
    and     r1, r11, #0x000000f8
    mov     r10, r1, lsr #3
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsr #5
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsr #8

    mov     r11, r2, asr #frac      @; (int)Xoffset
    ldr     r11, [r14, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              @; increment Xoffset
@; translate to 16bpp, shifted 16 bits further left for the high halfword
    and     r1, r11, #0x000000f8
    orr     r10, r10, r1, lsl #13
    and     r1, r11, #0x0000fc00
    orr     r10, r10, r1, lsl #11
    and     r1, r11, #0x00f80000
    orr     r10, r10, r1, lsl #8
    str     r10, [r12], #4          @; display 2 pixels
    adds    r5, r5, #2<<split       @; two pixels done
    ble     xloop16rgb              @; x count still negative: more to plot
xend16rgb:
    ldr     r1, [sp, #4]
    add     r3, r3, r1              @; increment Yoffset
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop16rgb
exit16rgb:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 24bpp 8:8:8 bgr (uses 32bpp)
@;
@; Scales a rectangle of 32-bit source pixels into 32-bit destination
@; words, exchanging source bits 7:0 with bits 23:16. Bits 15:8 are kept
@; in place and bits 31:24 of every stored word are zero.
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0  x magnification            r8  sprite top left address
@;   r1  scratch                    r9  sprite width in bytes
@;   r2  initial Xoffset            r10 source line address
@;   r3  Yoffset (fixed point)      r11 source pixel
@;   r4  dest width in pixels       r12 dest write pointer
@;   r5  packed x/y counts          r14 Xoffset (fixed point)
@;   r6  dest line start
@;   r7  pixel being assembled
@; Stack frame
@;   [sp]     y magnification (r1 is needed as scratch)
@;   [sp, #4] screen width in bytes (r7 is needed as scratch)
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot32bpp_bgr:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit32bgr
    bic     r6, r6, #3              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp]
    str     r7, [sp, #4]
yloop32bgr:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    mov     r14, r2                 @; initial Xoffset
    mov     r10, r3, asr #frac      @; (int)Yoffset
    mla     r10, r9, r10, r8        @; src_addr += src_width * Yoffset
    mov     r12, r6                 @; dest line start
xloop32bgr:
@; note, do 4 pixels in line if possible to save a few cycles.
@; The body is unrolled four times; the first three copies leave the row
@; as soon as the x count reaches 0, the fourth loops back while it has not.
    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
@; translate to 32bpp
    and     r7, r11, #0x0000ff00
    and     r1, r11, #0x000000ff
    orr     r7, r7, r1, lsl #16
    and     r1, r11, #0x00ff0000
    orr     r7, r7, r1, lsr #16
    str     r7, [r12], #4           @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32bgr

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
@; translate to 32bpp
    and     r7, r11, #0x0000ff00
    and     r1, r11, #0x000000ff
    orr     r7, r7, r1, lsl #16
    and     r1, r11, #0x00ff0000
    orr     r7, r7, r1, lsr #16
    str     r7, [r12], #4           @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32bgr

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
@; translate to 32bpp
    and     r7, r11, #0x0000ff00
    and     r1, r11, #0x000000ff
    orr     r7, r7, r1, lsl #16
    and     r1, r11, #0x00ff0000
    orr     r7, r7, r1, lsr #16
    str     r7, [r12], #4           @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32bgr

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
@; translate to 32bpp
    and     r7, r11, #0x0000ff00
    and     r1, r11, #0x000000ff
    orr     r7, r7, r1, lsl #16
    and     r1, r11, #0x00ff0000
    orr     r7, r7, r1, lsr #16
    str     r7, [r12], #4           @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done
    ble     xloop32bgr              @; x count still negative: more to plot

xend32bgr:
    ldr     r1, [sp]
    add     r3, r3, r1              @; increment Yoffset
    ldr     r7, [sp, #4]
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop32bgr
exit32bgr:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; destination depth = 24bpp 8:8:8 rgb (uses 32bpp)
@;
@; Scales a rectangle of 32-bit source pixels into 32-bit destination
@; words. Each selected source word is stored unchanged.
@;
@; in:  r0 -> parameter block (see the top of this file)
@; out: r4-r12 preserved; r0-r3 and the flags are corrupted
@;
@; Register use inside the loops
@;   r0  x magnification            r8  sprite top left address
@;   r1  scratch                    r9  sprite width in bytes
@;   r2  initial Xoffset            r10 source line address
@;   r3  Yoffset (fixed point)      r11 source pixel
@;   r4  dest width in pixels       r12 dest write pointer
@;   r5  packed x/y counts          r14 Xoffset (fixed point)
@;   r6  dest line start
@;   r7  screen width in bytes
@; Stack frame
@;   [sp]     y magnification (r1 is reloaded from here at the end of a row)
@;   [sp, #4] screen width in bytes
@;
@; A width or height of zero or less returns without plotting. Both loops
@; are tested at the bottom, so without this check the packed counter in
@; r5 never reaches its exit condition and pixels are written outside the
@; requested rectangle.
plot32bpp_rgb:
    stmfd   sp!, {r4-r12, lr}
    sub     sp, sp, #8
    ldmia   r0, {r0-r9}             @; load registers from blk
    cmp     r4, #0                  @; nothing to plot unless width > 0 ...
    cmpgt   r5, #0                  @; ... and height > 0
    ble     exit32rgb
    bic     r6, r6, #3              @; ensure pixel alignment
    mul     r2, r0, r2              @; initial Xoffset
    mul     r3, r1, r3              @; initial Yoffset
    str     r1, [sp]
    str     r7, [sp, #4]
yloop32rgb:
    sub     r5, r5, r4, lsl #split  @; upper part of r5 = -width, low part = rows left
    mov     r14, r2                 @; initial Xoffset
    mov     r10, r3, asr #frac      @; (int)Yoffset
    mla     r10, r9, r10, r8        @; src_addr += src_width * Yoffset
    mov     r12, r6                 @; dest line start
xloop32rgb:
@; note, do 4 pixels in line if possible to save a few cycles.
@; The body is unrolled four times; the first three copies leave the row
@; as soon as the x count reaches 0, the fourth loops back while it has not.
    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
@; no translation needed, source word is already in destination format
    str     r11, [r12], #4          @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32rgb

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    str     r11, [r12], #4          @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32rgb

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    str     r11, [r12], #4          @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done; GT once the x count hits 0
    bgt     xend32rgb

    mov     r11, r14, asr #frac     @; (int)Xoffset
    ldr     r11, [r10, r11, lsl #2] @; pixel = *(src_addr + (int)Xoffset)
    str     r11, [r12], #4          @; display pixel
    add     r14, r14, r0            @; increment Xoffset
    adds    r5, r5, #1<<split       @; one pixel done
    ble     xloop32rgb              @; x count still negative: more to plot

xend32rgb:
    ldr     r1, [sp]
    add     r3, r3, r1              @; increment Yoffset
    ldr     r7, [sp, #4]
    add     r6, r6, r7              @; next dest line start
    subs    r5, r5, #1              @; decrement y count by 1 pixel
    bgt     yloop32rgb
exit32rgb:
    add     sp, sp, #8
    ldmfd   sp!, {r4-r12, pc}

@;-----------------------------------------------------------

@; Plot function array
@; Exported table of the nine plotter entry points, one word per routine.
@; Every routine takes r0 = address of the parameter block described at
@; the top of this file. The name against each entry is presumably the
@; caller's index constant for that slot - confirm against the C side.
    .global plot
plot:
    .word plot8bpp       @; PLOT_256
    .word plot12bpp_bgr  @; PLOT_4K_BGR
    .word plot12bpp_rgb  @; PLOT_4K_RGB
    .word plot15bpp_bgr  @; PLOT_32K_BGR
    .word plot15bpp_rgb  @; PLOT_32K_RGB
    .word plot16bpp_bgr  @; PLOT_64K_BGR
    .word plot16bpp_rgb  @; PLOT_64K_RGB
    .word plot32bpp_bgr  @; PLOT_16M_BGR
    .word plot32bpp_rgb  @; PLOT_16M_RGB

@;-----------------------------------------------------------

