@;
@; coder.s
@; 08/05/01
@; P.Everett
@; Originally generated by gcc 2.7.2.1 (ARM/RISC OS v1.0.7) for ARM/RISC OS
@;
@;  Part of the Shine MP3 encoder for MidiCon
@;  16/9/2025
@;
@; This file contains the StrongArm analysis filters for the encoder.
@; It is split into two parts.
@; The subband filter and the mdct transform. These are called
@; separately from the c code. Inner loops have been expanded to reduce
@; execution time. 'c' comments are from the file coder.c
@; The subband filter matrix calculation has been simplified by exploiting
@; symmetry in the cosine coefficient tables.
@; I believe, although I have not seen it, that this is covered in the following
@; paper.
@; ref. Konstantinos Konstantinides, "Fast Subband Filtering in MPEG Audio Coding,"
@; IEEE Signal Processing Letters, Vol 1, No.2, pp.26-28, Feb 1994.
@; I haven't utilised all symmetries because the more complex ones didn't provide
@; any speed increase.
@;
@; Some parts of this file don't need to be in assembler but it's difficult to
@; split it up, whilst maintaining compatibility with the 'c' version.
@;
@;-------------------------------------------------------------------------------

@; Byte offsets of the fields this file reads from the 'c' config structure.
@; Each one is written as (32-bit word index) * 4.
        .equ    CFG_WAVE_CHAN, 4*4      @; word  4: config.wave.channels
        .equ    CFG_MPEG_CHAN, 29*4     @; word 29: config.mpeg.channels
        .equ    CFG_MPEG_GRAN, 30*4     @; word 30: config.mpeg.granules


@        IMPORT  ca
@        IMPORT  cs
@        IMPORT  cos_l
@        IMPORT  fl
@        IMPORT  ew
@        IMPORT  config
@        IMPORT  z
@        IMPORT  x

        .data
        .align 2                           @; off[] is accessed with word ldr/str
@; off[k]: per-channel word offset into x[k]. Starts at 0 and is advanced
@; modulo 512 by L3_window_filter_subband_sa on every call.
off:    .word 0,0

        .text
        .align 2                           @; the words below are read with pc-relative ldr
@; address holders
@; Each word holds the address of a data object so that the code can fetch
@; it with a single pc-relative ldr. x, z, ew and fl are defined outside
@; this file.
x_adr:   .word  x
off_adr: .word  off
z_adr:   .word  z
ew_adr:  .word  ew
fl_adr:  .word  fl

        .align 2
@;-------------------------------------------------------------------------------
@; L3_window_filter_subband_sa
@;
@; Subband analysis filter for one channel. In order, it:
@;   1. copies new input samples into x[k] at off[k] (32 entries, i = 31..0)
@;   2. windows the 512 entry history: z[i] = high word of x[k][(i+off[k])&511] * ew[i]
@;   3. steps off[k] by 480 modulo 512
@;   4. folds z[] down to the 32 values z[16..47]
@;   5. multiplies z[16..47] by the fl coefficients to give 32 subband outputs
@;
@; In:     a1 = buffer. The word at [a1] is the input read pointer; it is
@;              advanced past the words consumed and written back.
@;         a2 = s, output array. 32 words are stored: s[0..15] ascending and
@;              s[31..16] descending.
@;         a3 = k, channel index. Selects off[k] and x[k] (2048 bytes per channel).
@; Out:    no return value is set up. a1-a4, ip and the flags are modified.
@; Saved:  v1-v6 and lr are pushed on entry and restored on exit.
@; Reads:  config.mpeg.channels, config.wave.channels, ew, fl
@; Writes: x[k], z, off[k], s[0..31], *buffer
@;
@; NOTE(review): presumably called from c as
@;   L3_window_filter_subband_sa(unsigned long **buffer, long s[32], int k)
@; - confirm against the prototype in the 'c' sources.
@;-------------------------------------------------------------------------------
        .global L3_window_filter_subband_sa
L3_window_filter_subband_sa:
        stmfd   sp!, {v1, v2, v3, v4, v5, v6, lr}
        mov     v3, a1                     @; buffer       , input
        mov     v6, a2                     @; s (sb_sample), output
        ldr     v2, off_adr                @; &off
        ldr     a2, [v2, a3, asl #2]       @; off[k] (a2 stays live until off[k] is updated)
@;---------------------------------
@; /* replace 32 oldest samples with 32 new samples */
@; for (i=31;i>=0;i--)
        ldr     a4, x_adr                  @; &x
        add     v4, a4, a3, asl #11        @; &x[k]           (512 words per channel)
        add     v4, v4, a2, asl #2         @; &x[k][off[k]]
        ldr     v1, [v3, #0]               @; load buffer pointer

@; format depends on mode
@;   config.mpeg.channels == 2                   -> one half of each word, chosen by k
@;   else if config.wave.channels == 2           -> sum of both halves of each word
@;   else                                        -> both halves used as two entries
        ldr     lr, cfg_adr
        ldr     a4, [lr, #CFG_MPEG_CHAN]   @; config.mpeg.channels
        cmp     a4, #2                     @; stereo output ?
        beq     stereo
        ldr     a4, [lr, #CFG_WAVE_CHAN]   @; config.wave.channels
        cmp     a4, #2                     @; stereo input ?
        beq     use_sum
        b       use_both

stereo:
        cmp     a3, #1                     @; k == 1 takes the upper half, otherwise the lower
        beq     use_upper

@; stereo right
use_lower:
        mov     lr, #31                    @; i
lower_loop:
@; x[k][i+off[k]] = (*(*buffer)++) << 16;

        ldr     a4, [v1], #4               @; *(*buffer)++
        mov     a4, a4, asl #16            @; left justify
        str     a4, [v4, lr, asl #2]       @; x[k][i+off[k]]
        subs    lr, lr, #1                 @; i--
        bpl     lower_loop
        b       input_end

@; stereo left
use_upper:
        mov     lr, #31                    @; i
upper_loop:
@; x[k][i+off[k]] = ((*(*buffer)++) >> 16) << 16;

        ldr     a4, [v1], #4               @; *(*buffer)++
        mov     a4, a4, asr #16            @; right justify
        mov     a4, a4, asl #16            @; left justify
        str     a4, [v4, lr, asl #2]       @; x[k][i+off[k]]
        subs    lr, lr, #1                 @; i--
        bpl     upper_loop
        b       input_end

@; mono from stereo
@; x[k][i+off[k]] = (sign extended low half + sign extended high half) << 15
use_sum:
        mov     lr, #31                    @; i
sum_loop:
        ldr     a4, [v1], #4               @; *(*buffer)++
        mov     a1, a4, asl #16
        mov     a1, a1, asr #16            @; sign extended low half
        add     a1, a1, a4, asr #16        @; + sign extended high half
        mov     a1, a1, asl #15            @; scale the sum
        str     a1, [v4, lr, asl #2]       @; x[k][i+off[k]]
        subs    lr, lr, #1                 @; i--
        bpl     sum_loop
        b       input_end

@; mono
@; each input word supplies two entries, so only 16 words are read:
@; the low half goes to index i and the high half to index i-1
use_both:
        mov     lr, #31                    @; i
both_loop:
        ldr     a4, [v1], #4               @; *(*buffer)++
        mov     a1, a4, asl #16            @; low half, left justified
        str     a1, [v4, lr, asl #2]       @; x[k][i+off[k]]
        sub     lr, lr, #1                 @; i--
        mov     a1, a4, asr #16
        mov     a1, a1, asl #16            @; high half, left justified
        str     a1, [v4, lr, asl #2]       @; x[k][i+off[k]]
        subs    lr, lr, #1                 @; i--
        bpl     both_loop

input_end:
        str     v1, [v3, #0]               @; store buffer pointer
@;---------------------------------
@; /* shift samples into proper window positions */
@; for (i=HAN_SIZE; i--; )

        mov     lr, #508
        add     lr, lr, #3                 @; i (511 is built in two steps)
        mov     ip, lr                     @; 511, kept as the modulo mask
        ldr     v4, z_adr                  @; &z
        mov     a1, a3, asl #11            @; k*512*4 (ch)
        ldr     a4, x_adr                  @; &x
        add     v3, a1, a4                 @; &x[k]
        ldr     v5, ew_adr                 @; &ew
window_loop:
@; z[i] = x[k][(i+off[k])&(HAN_SIZE-1)] * ew[i];

        add     a4, lr, a2                 @; i+off[k]
        and     a4, a4, ip                 @; (i+off[k])&511
        ldr     a1, [v3, a4, asl #2]       @; x[k][(i+off[k])&511]
        ldr     a4, [v5, lr, asl #2]       @; ew[i]
        smull   v1, a4, a1, a4             @; x * ew, only the high word (a4) is kept
        str     a4, [v4, lr, asl #2]       @; z[i]
        subs    lr, lr, #1                 @; i--
        bpl     window_loop
@;---------------------------------
@;  off[k] = (off[k] + 480) & (HAN_SIZE-1); /* offset is modulo (HAN_SIZE)*/

        add     a2, a2, #480
        and     a2, a2, ip
        str     a2, [v2, a3, asl #2]       @; off[k]
@;---------------------------------
@; sub sample the windowed data, and combine subsamples for simplified
@; matrix calculation.
@; On completion z[16..47] hold the 32 values used by the matrix multiply.

@; first add sub sample [16] that does not need combining
@; z[16] = z[16] + z[80] + z[144] + ... + z[464]   (8 terms, 64 words apart)
        ldr     a1, [v4, #64]
        ldr     a2, [v4, #320]
        add     a1, a1, a2
        ldr     a2, [v4, #576]
        add     a1, a1, a2
        ldr     a2, [v4, #832]
        add     a1, a1, a2
        ldr     a2, [v4, #1088]
        add     a1, a1, a2
        ldr     a2, [v4, #1344]
        add     a1, a1, a2
        ldr     a2, [v4, #1600]
        add     a1, a1, a2
        ldr     a2, [v4, #1856]
        add     a1, a1, a2
        str     a1, [v4, #64]

@; next combine the remaining 16 quadrant 1 & 2 sub samples
@; for n = 0..15:
@;   z[17+n] = sum over m=0..7 of ( z[17+n+64*m] + z[15-n+64*m] )
        mov     v2, v4      @; z increasing
        mov     v3, v4      @; z decreasing
        mov     lr, #16

sub_sample_loop_1:
        ldr     a1, [v2, #68]
        ldr     a2, [v2, #324]
        add     a1, a1, a2
        ldr     a2, [v2, #580]
        add     a1, a1, a2
        ldr     a2, [v2, #836]
        add     a1, a1, a2
        ldr     a2, [v2, #1092]
        add     a1, a1, a2
        ldr     a2, [v2, #1348]
        add     a1, a1, a2
        ldr     a2, [v2, #1604]
        add     a1, a1, a2
        ldr     a2, [v2, #1860]
        add     a1, a1, a2
@; add quadrant 1 to 2
        ldr     a2, [v3, #60]
        add     a1, a1, a2
        ldr     a2, [v3, #316]
        add     a1, a1, a2
        ldr     a2, [v3, #572]
        add     a1, a1, a2
        ldr     a2, [v3, #828]
        add     a1, a1, a2
        ldr     a2, [v3, #1084]
        add     a1, a1, a2
        ldr     a2, [v3, #1340]
        add     a1, a1, a2
        ldr     a2, [v3, #1596]
        add     a1, a1, a2
        ldr     a2, [v3, #1852]
        add     a1, a1, a2

        str     a1, [v2, #68]
        add     v2, v2, #4
        sub     v3, v3, #4
        subs    lr, lr, #1
        bne     sub_sample_loop_1

@; next combine the 15 sub samples from quadrant 3 & 4
@; for n = 0..14:
@;   z[33+n] = sum over m=0..7 of ( z[33+n+64*m] - z[63-n+64*m] )
        mov     v2, v4      @; z increasing
        mov     v3, v4      @; z decreasing
        mov     lr, #15

sub_sample_loop_2:
        ldr     a1, [v2, #132]
        ldr     a2, [v2, #388]
        add     a1, a1, a2
        ldr     a2, [v2, #644]
        add     a1, a1, a2
        ldr     a2, [v2, #900]
        add     a1, a1, a2
        ldr     a2, [v2, #1156]
        add     a1, a1, a2
        ldr     a2, [v2, #1412]
        add     a1, a1, a2
        ldr     a2, [v2, #1668]
        add     a1, a1, a2
        ldr     a2, [v2, #1924]
        add     a1, a1, a2
@; subtract quadrant 4 from 3
        ldr     a2, [v3, #252]
        sub     a1, a1, a2
        ldr     a2, [v3, #508]
        sub     a1, a1, a2
        ldr     a2, [v3, #764]
        sub     a1, a1, a2
        ldr     a2, [v3, #1020]
        sub     a1, a1, a2
        ldr     a2, [v3, #1276]
        sub     a1, a1, a2
        ldr     a2, [v3, #1532]
        sub     a1, a1, a2
        ldr     a2, [v3, #1788]
        sub     a1, a1, a2
        ldr     a2, [v3, #2044]
        sub     a1, a1, a2

        str     a1, [v2, #132]
        add     v2, v2, #4
        sub     v3, v3, #4
        subs    lr, lr, #1
        bne     sub_sample_loop_2

@;---------------------------------
@; polyphase filter matrix multiplication
@; we only perform the calculation on the 31 samples of quadrants 2 & 3
@; also, bands 31..16 are calculated at the same time as 0..15 using the
@; symmetry,  fl[31-i][j] = -1**j * fl[i][j]
@; There is another symmetry that can be applied but it is rather complex
@; and doesn't provide any speed increase in this instance. It is debatable
@; whether the applied mod is actually worth it here.

@;  for (i=0; i<16; i++)
@;    for (j=32, s[i]= 0; j--; )
@;      s[i] += fl[i][j] * z[j+16];

        mov     lr, #15           @; 16 passes, one per band pair
        mov     v5, v6            @; &s[0]
        add     v6, v6, #124      @; &s[31]
        ldr     v3, fl_adr        @; &fl

@; registers,
@; v3 = &fl        (coefficients)
@; v4 = &z         (input data)
@; a4a3 = band 0..15 accumulator
@; v2v1 = band 31..16 accumulator
@; a1 a2 ip = temp
@; v5 = &sb_sample[0..15]  (output data increasing band)
@; v6 = &sb_sample[31..16] (output data decreasing band)
@; lr = loop counter (band)

@; matrix n (n odd): accumulate terms n and n+1.
@; Each term reads the next fl word (v3 post-incremented) and z[16+term].
@; The product is added to a4a3; it is subtracted from v2v1 for the odd
@; term and added to v2v1 for the even term.
@; Clobbers a1, a2, ip and the flags.
        .macro matrix n
@;1,3,5,... (odd)
        ldr     a2, [v3], #4
        ldr     a1, [v4, #64+4*\n]
        smull   a2, ip, a1, a2
        adds    a3, a3, a2
        adc     a4, a4, ip
        subs    v1, v1, a2
        sbc     v2, v2, ip
@;2,4,6,... (even)
        ldr     a2, [v3], #4
        ldr     a1, [v4, #68+4*\n]
        smull   a2, ip, a1, a2
        adds    a3, a3, a2
        adc     a4, a4, ip
        adds    v1, v1, a2
        adc     v2, v2, ip
        .endm

matrix_loop:
@;       matrix 0
@; term 0 initialises both accumulators with the same product
        ldr     a2, [v3], #4
        ldr     a1, [v4, #64]
        smull   a3, a4, a1, a2
        mov     v1, a3
        mov     v2, a4

        matrix 1
        matrix 3
        matrix 5
        matrix 7
        matrix 9
        matrix 11
        matrix 13
        matrix 15
        matrix 17
        matrix 19
        matrix 21
        matrix 23
        matrix 25
        matrix 27
        matrix 29
@;31
@; last (odd) term on its own, there is no term 32
        ldr     a2, [v3], #4
        ldr     a1, [v4, #64+4*31]
        smull   a2, ip, a1, a2
        adds    a3, a3, a2
        adc     a4, a4, ip
        subs    v1, v1, a2
        sbc     v2, v2, ip

@; round each 64 bit sum to its high word and store it
        adds    a3, a3, #0x80000000
        adc     a4, a4, #0
        str     a4, [v5], #4               @; s[band], band = 0..15

        adds    v1, v1, #0x80000000
        adc     v2, v2, #0
        str     v2, [v6], #-4              @; s[31-band]

        subs    lr, lr, #1
        bpl     matrix_loop
@;
        ldmfd   sp!, {v1, v2, v3, v4, v5, v6, pc}

@;-----------------------------------------------------------------


@; MDCT function
@; -------------

@; Literal words holding the addresses of data objects defined outside this
@; file. The code fetches them with pc-relative ldr: cfg_adr is used by both
@; functions in this file, the other three by L3_mdct_sub_sa.
cfg_adr:
        .word   config
cos_l_adr:
        .word   cos_l
cs_adr:
        .word   cs
ca_adr:
        .word   ca

        .align 2
@;-------------------------------------------------------------------------------
@; L3_mdct_sub_sa
@;
@; For every granule and channel:
@;   1. negates sb_sample[ch][gr+1][k][band] for odd k and odd band
@;   2. computes 18 mdct outputs per band from 36 consecutive sb_sample rows
@;      (starting at sb_sample[ch][gr]) and the cos_l table
@;   3. applies the aliasing reduction butterfly (cs, ca) between adjacent bands
@; Finally the latest granule of sb_sample is copied to slot 0 of each channel
@; for use by the next call.
@;
@; In:     a1 = sb_sample base. Strides used: 6912 bytes per channel,
@;              2304 bytes per granule slot, 128 bytes per row k, 4 per band.
@;         a2 = mdct_freq base. Strides used: 4608 bytes per granule,
@;              2304 bytes per channel, 72 bytes per band, 4 per k.
@; Out:    no return value is set up. a1-a4, ip and the flags are modified.
@; Saved:  v1-v6 and lr are pushed on entry and restored on exit.
@; Reads:  config.mpeg.channels, config.mpeg.granules, cos_l, cs, ca
@; Writes: sb_sample (sign inversion and saved granule), mdct_freq
@;
@; Stack frame (28 bytes):
@;   [sp, #0]   sb_sample base
@;   [sp, #4]   gr
@;   [sp, #8]   gr*2304, copy of [sp, #20] taken at the start of each granule
@;   [sp, #12]  &sb_sample[ch]
@;   [sp, #16]  &mdct_freq[gr][ch]
@;   [sp, #20]  gr*2304, byte offset of the granule inside sb_sample[ch]
@;   [sp, #24]  &mdct_freq[gr]
@;
@; v2 holds ch for the whole of the channel loop and must not be used as a
@; temporary inside it.
@;-------------------------------------------------------------------------------
        .global L3_mdct_sub_sa
L3_mdct_sub_sa:
        stmfd   sp!, {v1, v2, v3, v4, v5, v6, lr}
        sub     sp, sp, #28
        str     a1, [sp, #0]    @; sb_sample
@;  for(gr=0; gr<2; gr++)
        mov     v1, #0
        str     v1, [sp, #4]    @; gr
        str     a2, [sp, #24]   @; mdct_freq
        str     v1, [sp, #20]   @; gr*2304
@;    for(ch=config.mpeg.channels; ch--; )
mdct_gr_loop:
        ldr     a4, cfg_adr
        ldr     v2, [a4, #CFG_MPEG_CHAN] @; config.mpeg.channels (1..2)
        sub     v2, v2, #1      @; ch (0..1)
        cmn     v2, #1          @; channels == 0 ? then there is nothing to do
        beq     L_35
@;      /* set up pointer to the part of mdct_freq we're using */
@;      mdct_enc = (long (*)[18]) mdct_freq[gr][ch];
        add     a4, v2, v2, asl #1      @; ch*3
        add     a4, a4, a4, asl #3      @; ch*27
        ldr     v5, [sp, #0]
        add     a4, v5, a4, asl #8      @; sb_sample + ch*6912
        str     a4, [sp, #12]           @; &sb_sample[ch]
        ldr     v1, [sp, #20]
        str     v1, [sp, #8]            @; gr*2304
        add     a4, v2, v2, asl #3      @; ch*9
        ldr     v1, [sp, #24]
        add     a4, v1, a4, asl #8      @; &mdct_freq[gr] + ch*2304
        str     a4, [sp, #16]           @; &mdct_freq[gr][ch]
L_40:
@;      /* Compensate for inversion in the analysis filter
@;       * (every odd index of band AND k)
@;       */
@;      for(band=1; band<=31; band+=2 )
        mov     lr, #1
invert_band:
@;        for(k=1; k<=17; k+=2 )
        mov     ip, #1
        mov     a3, lr, asl #2          @; band*4
        ldr     v5, [sp, #8]
        ldr     v1, [sp, #12]
        add     a4, v5, v1              @; &sb_sample[ch][gr]
        add     a2, a4, #128            @; row k=1
invert_loop:
@;          sb_sample[ch][gr+1][k][band] *= -1;
        add     a1, a3, a2              @; &sb_sample[ch][gr][k][band]
        add     a2, a2, #256            @; step two rows for the next pass
        ldr     a4, [a1, #2304]         @; +2304 selects granule slot gr+1
        rsb     a4, a4, #0
        str     a4, [a1, #2304]
        add     ip, ip, #2
        cmp     ip, #17
        ble     invert_loop
        add     lr, lr, #2
        cmp     lr, #31
        ble     invert_band

@;------------------------------------
@;      /* Perform imdct of 18 previous subband samples + 18 current subband samples */
@;      for(band=32; band--; )
@; lr counts 31..0, but v1 and v3 both walk upwards, so the bands are
@; processed in ascending order.
        mov     lr, #31
        ldr     v3, [sp, #16]   @; mdct_freq[gr][ch] ([band][k])
        ldr     v4, [sp, #8]    @; gr*2304
        ldr     v1, [sp, #12]   @; sb_sample[ch]
        add     v1, v1, v4      @; sb_sample[ch][gr] ([k][band])

mdct_band_loop:
@; Calculation of the MDCT
@; In the case of long blocks ( block_type 0,1,3 ) there are
@; 36 coefficients in the time domain and 18 in the frequency
@; domain.
@;
@;        for(k=18; k--; )
@;        {
@;          m = &mdct_enc[band][k];
@;          for(j=36, *m=0; j--; )
@;            *m += mdct_in[j] * cos_l[k][j];
@;        }

        mov     ip, #17               @; 18 outputs per band
        ldr     v4, cos_l_adr         @; &cos_l, read sequentially, 36 words per output

@; note. because of the way that the granules are declared in the data array,
@; this code will automatically use samples from the last and current granules
@; directly without having to copy them to a separate array first.
mdct_inner_loop:
@; JRF: AFAICT, a3, v6, sl are free in this loop
@;      Because of this, I can load values from the v4 array using an LDMIA
@;      rather than loading them individually.
@;      Note: I'm not going to use sl in this version of the code because I
@;            cannot guarantee that this section of code will not abort. If
@;            it were to abort and sl were corrupt then Bad Things (TM) would
@;            happen. It would be much more efficient if we could guarantee
@;            that we would not abort in here because we could pull 4 registers
@;            rather than just 3.
@;            I would do this as a macro, but I don't know if GCC can cope with
@;            that.
@; PRE: 20/4/03 created macro, assembles ok with both GCC and Norcroft.

@; a4:v5 = 64 bit sum of 36 products, sb_sample row j (128 bytes apart) * cos_l word j
        ldr     a2, [v1], #128
        LDMIA   v4!, {a1,a3,v6}
        smull   v5, a4, a1, a2        @; first one doesn't accumulate
        ldr     a2, [v1], #128
        smlal   v5, a4, a3, a2
        ldr     a2, [v1], #128
        smlal   v5, a4, v6, a2

@; mdct_inner: accumulate three more products into a4:v5.
@; Advances v1 by three rows and v4 by three words. Clobbers a1, a2, a3, v6.
        .macro mdct_inner
        ldr     a2, [v1], #128
        LDMIA   v4!, {a1,a3,v6}
        smlal   v5, a4, a1, a2
        ldr     a2, [v1], #128
        smlal   v5, a4, a3, a2
        ldr     a2, [v1], #128
        smlal   v5, a4, v6, a2
        .endm

@; 3 products above + 11 * 3 here = 36
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner
        mdct_inner

        adds    v5, v5, #0x80000000   @; round to the high word
        adc     a4, a4, #0

        str     a4, [v3], #4    @; mdct_freq
        sub     v1, v1, #4608   @; sb_sample, back to row 0 (36 rows * 128)
        subs    ip, ip, #1
        bpl     mdct_inner_loop

        add     v1, v1, #4    @; sb_sample, next band
        subs    lr, lr, #1
        bpl     mdct_band_loop

@;------------------------------------
@; Aliasing reduction butterfly

@; alias_red n: one butterfly between mdct_enc[band][17-n] and mdct_enc[band+1][n].
@; Expects v6 = &mdct_enc[band][0], v5 = &cs, v4 = &ca.
@; Both inputs are loaded before either result is stored.
@; Clobbers a1-a4, v1, v3 and the flags.
        .macro alias_red n
        ldr     a3, [v5, #4*\n]                 @; cs[k]
        ldr     v3, [v4, #4*\n]                 @; ca[k]
        ldr     a2, [v6, #4*17-4*\n]            @; mdct_enc[band][17-k]
        ldr     a1, [v6, #4*18+4*\n]            @; mdct_enc[band+1][k]
@; mdct_enc[band][17-k] = mdct_enc[band][17-k] * cs[k] + mdct_enc[band+1][k] * ca[k];
        smull   v1, a4, a3, a2
        smlal   v1, a4, v3, a1
        adds    v1, v1, v1                      @; left justify the result
        adc     a4, a4, a4
        adds    v1, v1, #0x80000000             @; round the result
        adc     a4, a4, #0
        str     a4, [v6, #4*17-4*\n]            @; mdct_enc[band][17-k]
@; mdct_enc[band+1][k] = mdct_enc[band+1][k] * cs[k] - mdct_enc[band][17-k] * ca[k];
        rsb     v3, v3, #0                      @; negate ca[k] so the smlal subtracts
        smull   v1, a4, a3, a1
        smlal   v1, a4, v3, a2
        adds    v1, v1, v1                      @; left justify the result
        adc     a4, a4, a4
        adds    v1, v1, #0x80000000             @; round the result
        adc     a4, a4, #0
        str     a4, [v6, #4*18+4*\n]            @; mdct_enc[band+1][k]
        .endm


@;      /* Perform aliasing reduction butterfly */
@;      for(band=31; band--; )
        mov     lr, #30                         @; band
        ldr     v5, [sp, #16]                   @; &mdct_enc
        add     v6, v5, #2160                   @; &mdct_enc[30][0]
        ldr     v5, cs_adr                      @; &cs
        ldr     v4, ca_adr                      @; &ca

alias_red_loop:
@; for(k=8; k--; )
        alias_red 7
        alias_red 6
        alias_red 5
        alias_red 4
        alias_red 3
        alias_red 2
        alias_red 1
        alias_red 0


        sub     v6, v6, #72                     @; &mdct[band--][0]
        subs    lr, lr, #1                      @; band--
        bpl     alias_red_loop

@;---------------------------------------
@; step both pointers down to the previous channel and loop while ch >= 0
        ldr     v5, [sp, #12]
        sub     v5, v5, #6912                   @; &sb_sample[ch-1]
        ldr     v1, [sp, #16]
        str     v5, [sp, #12]
        sub     v1, v1, #2304                   @; &mdct_freq[gr][ch-1]
        str     v1, [sp, #16]
        subs    v2, v2, #1                      @; ch--
        bpl     L_40
L_35:
@; advance the granule offsets
        ldr     v5, [sp, #20]
        add     v5, v5, #2304                   @; (gr+1)*2304
        ldr     v1, [sp, #24]
        str     v5, [sp, #20]
        add     v1, v1, #4608                   @; &mdct_freq[gr+1]
        str     v1, [sp, #24]
@; exit here in mpeg 2 or 2.5 (only 1 granule)
        ldr     v5, cfg_adr
        ldr     v5, [v5, #CFG_MPEG_GRAN]      @; config.mpeg.granules
        cmp     v5, #1
        beq     mdct_exit
@; check for second granule for mpeg 1
        ldr     v5, [sp, #4]
        add     v5, v5, #1
        str     v5, [sp, #4]
        cmp     v5, #1
        ble     mdct_gr_loop
mdct_exit:

@;------------------------------------------------
@;  Save latest granule's subband samples to be used in the next mdct call
@;  for(ch=config.mpeg.channels ;ch--; )
@;    for(j=18; j--; )
@;      for(band=32; band--; )
@;        sb_sample[ch][0][j][band] = sb_sample[ch][config.mpeg.granules][j][band];

        ldr     a1, cfg_adr
        ldr     v6, [a1, #CFG_MPEG_CHAN]   @; config.mpeg.channels (1..2)
@; The copy loop below tests its counter at the bottom, so a channel count
@; of 0 would wrap to 0xffffffff and keep copying. The 'c' loop above and
@; the granule loop both skip that case, so skip it here too.
        cmp     v6, #0
        ble     mdct_return
        ldr     ip, [sp, #0]               @; sb_sample[0][0]
@; add 2304, then add another 2304 if mpeg 1
        add     lr, ip, #2304              @; sb_sample[0][1]  (mpeg 2/2.5)
        ldr     a2, [a1, #CFG_MPEG_GRAN]   @; config.mpeg.granules (1..2)
        cmp     a2, #2
        addeq   lr, lr, #2304              @; sb_sample[0][2]  (mpeg 1)
save_gr_ch_loop:
        mov     v5, #18                    @; 18 rows of 32 words = 2304 bytes
save_gr_inner_loop:
@; one row per pass, moved as 4 blocks of 8 words
        ldmia   lr!, {a1, a2, a3, a4, v1, v2, v3, v4}
        stmia   ip!, {a1, a2, a3, a4, v1, v2, v3, v4}
        ldmia   lr!, {a1, a2, a3, a4, v1, v2, v3, v4}
        stmia   ip!, {a1, a2, a3, a4, v1, v2, v3, v4}
        ldmia   lr!, {a1, a2, a3, a4, v1, v2, v3, v4}
        stmia   ip!, {a1, a2, a3, a4, v1, v2, v3, v4}
        ldmia   lr!, {a1, a2, a3, a4, v1, v2, v3, v4}
        stmia   ip!, {a1, a2, a3, a4, v1, v2, v3, v4}
        subs    v5, v5, #1
        bne     save_gr_inner_loop

@; both pointers have moved on 2304 bytes, 4608 more reaches the next channel
        add     ip, ip, #4608
        add     lr, lr, #4608
        subs    v6, v6, #1
        bne     save_gr_ch_loop

mdct_return:
        add     sp, sp, #28
        ldmfd   sp!, {v1, v2, v3, v4, v5, v6, pc}

@;------------------------------------------------------
@;------------------------------------------------------
        .align 2
        .end
