// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Multiply modulo p_25519, z := (x * y) mod p_25519
// Inputs x[4], y[4]; output z[4]
//
//    extern void bignum_mul_p25519(uint64_t z[static 4], const uint64_t x[static 4],
//                                  const uint64_t y[static 4]);
//
// Standard ARM ABI: X0 = z, X1 = x, X2 = y
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        // Symbol directives for the entry point. These macros come from the
        // included header, so their exact expansion is not visible in this file.
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_mul_p25519)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519)
        .text
        .balign 4

// Pointer arguments, in the order given by the prototype: z (output),
// x and y (inputs).

#define z x0
#define x x1
#define y x2

// A pair of digits from x (a1:a0) and a pair from y (b1:b0) for the
// current 2x2 sub-product. The "short" names are the 32-bit (w) views of
// the same registers, used for the 32x32->64 multiplies.

#define a0 x3
#define a0short w3
#define a1 x4
#define b0 x5
#define b0short w5
#define b1 x6

// The 8-digit product accumulator [u7,...,u0], least significant digit u0

#define u0 x7
#define u1 x8
#define u2 x9
#define u3 x10
#define u4 x11
#define u5 x12
#define u6 x13
#define u7 x14

// 32-bit (w) views of the accumulator digits, used by the reduction stage

#define u0short w7
#define u1short w8
#define u2short w9
#define u3short w10
#define u4short w11
#define u5short w12
#define u6short w13
#define u7short w14

// General temporary

#define t  x15

// Sign masks (all zeros or all ones) for the sign-magnitude differences

#define sgn x16
#define ysgn x17

// These are aliases to registers used elsewhere including input pointers.
// By the time they are used this does not conflict with other uses.
// [m3,m2,m1,m0] holds the mid-product; u is the sign mask for its own
// internal Karatsuba cross term and shares a register with u2.

#define m0 y
#define m1 ysgn
#define m2 t
#define m3 x
#define u u2

// For the reduction stages, again aliasing other things but not the u's
// (c, h, l, d reuse the a0, a1, b0, b1 registers and q reuses ysgn/m1).
// NOTE(review): d is defined but not referenced by the code in this file.

#define c x3
#define cshort w3
#define h x4
#define l x5
#define lshort w5
#define d x6
#define q x17
#define qshort w17

// Entry point: x0 = z (output), x1 = x, x2 = y. The inputs are each read as
// four 64-bit digits with ldp at byte offsets 0 (low half) and 16 (high
// half); the four result digits are written with the final stp pair.
//
// Outline of the straight-line code below:
//   1. L = x_lo * y_lo as [u3,u2,u1,u0]         (2x2 Karatsuba)
//   2. H = x_hi * y_hi as [u7,u6,u5,u4]         (2x2 Karatsuba)
//   3. M = |x_hi - x_lo| * |y_lo - y_hi| as [m3,m2,m1,m0] with its sign
//      mask in sgn, then combine L, H and M into the 8-digit product
//   4. Fold the high half in using 2^256 == 38 (mod p_25519), apply a
//      quotient estimate, and make one final conditional correction
//
// Only x1-x17 are written; x0 is left untouched so it still addresses the
// output at the end. There are no loads or stores other than the argument
// reads and the result write.

S2N_BN_SYMBOL(bignum_mul_p25519):
        CFI_START

// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0]

        ldp     a0, a1, [x]
        ldp     b0, b1, [y]

// [u1,u0] = a0 * b0, assembled from four 32x32->64 products: lo*lo goes
// into u0, hi*hi into u1, and the two cross products (x15 = a0_hi * b0_lo,
// then x16 = a0_lo * b0_hi) are each added in at a 32-bit offset, with the
// carry out of u0 propagated into u1. Here x15/x16/x17 are plain scratch;
// they take on their t/sgn/ysgn roles later.

        umull   u0, a0short, b0short
        lsr     x17, a0, #32
        umull   x15, w17, b0short
        lsr     x16, b0, #32
        umull   u1, w16, w17
        umull   x16, a0short, w16
        adds    u0, u0, x15, lsl #32
        lsr     x15, x15, #32
        adc     u1, u1, x15
        adds    u0, u0, x16, lsl #32
        lsr     x16, x16, #32
        adc     u1, u1, x16

// [u3,u2] = a1 * b1

        mul     u2, a1, b1
        umulh   u3, a1, b1

// a1 := |a1 - a0|, with sgn := all ones if the subtraction borrowed
// (carry clear), otherwise zero

        subs    a1, a1, a0
        cneg    a1, a1, cc
        csetm   sgn, cc

// [u3,u2] += u1, i.e. add the top digit of a0 * b0 into a1 * b1

        adds    u2, u2, u1
        adc     u3, u3, xzr

// a0 := |b0 - b1|, flipping sgn if this subtraction borrowed, so that sgn
// ends up as the sign mask of the product (a1 - a0) * (b0 - b1)

        subs    a0, b0, b1
        cneg    a0, a0, cc
        cinv    sgn, sgn, cc

// [a0,t] = |a1 - a0| * |b0 - b1|  (a0 is reused for the high word)

        mul     t, a1, a0
        umulh   a0, a1, a0

// Form the unsigned part of the Karatsuba combination: writing T for the
// current [u3,u2], the new [u3,u2,u1] is u0 + T at digit 1 plus T at
// digit 2, with carries propagated upward.

        adds    u1, u0, u2
        adcs    u2, u2, u3
        adc     u3, u3, xzr

// Add the signed cross term at digit 1. "adds xzr, sgn, #1" only sets the
// carry flag, exactly when sgn is all ones; combined with the eor by sgn
// this adds either [a0,t] or its two's complement negation, and the final
// adc with sgn supplies the sign extension.

        adds    xzr, sgn, #1
        eor     t, t, sgn
        adcs    u1, t, u1
        eor     a0, a0, sgn
        adcs    u2, a0, u2
        adc     u3, u3, sgn

// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4]
// This is the same instruction sequence as for the low halves, applied to
// the digits at byte offset 16 and accumulating into u4..u7.

        ldp     a0, a1, [x, #16]
        ldp     b0, b1, [y, #16]

        umull   u4, a0short, b0short
        lsr     x17, a0, #32
        umull   x15, w17, b0short
        lsr     x16, b0, #32
        umull   u5, w16, w17
        umull   x16, a0short, w16
        adds    u4, u4, x15, lsl #32
        lsr     x15, x15, #32
        adc     u5, u5, x15
        adds    u4, u4, x16, lsl #32
        lsr     x16, x16, #32
        adc     u5, u5, x16

        mul     u6, a1, b1
        umulh   u7, a1, b1

        subs    a1, a1, a0
        cneg    a1, a1, cc
        csetm   sgn, cc

        adds    u6, u6, u5
        adc     u7, u7, xzr

        subs    a0, b0, b1
        cneg    a0, a0, cc
        cinv    sgn, sgn, cc

        mul     t, a1, a0
        umulh   a0, a1, a0

        adds    u5, u4, u6
        adcs    u6, u6, u7
        adc     u7, u7, xzr

        adds    xzr, sgn, #1
        eor     t, t, sgn
        adcs    u5, t, u5
        eor     a0, a0, sgn
        adcs    u6, a0, u6
        adc     u7, u7, sgn

// Compute  sgn,[a1,a0] = x_hi - x_lo
// and     ysgn,[b1,b0] = y_lo - y_hi
// sign-magnitude differences
//
// a0/a1 were overwritten above so the high digits of x are reloaded, with
// t and sgn briefly holding the low digits of x. b0/b1 still hold the high
// digits of y from the previous load, so only the low digits of y are
// loaded (into t and ysgn). Each mask is all ones when the 128-bit
// subtraction borrowed.

        ldp     a0, a1, [x, #16]
        ldp     t, sgn, [x]
        subs    a0, a0, t
        sbcs    a1, a1, sgn
        csetm   sgn, cc

        ldp     t, ysgn, [y]
        subs    b0, t, b0
        sbcs    b1, ysgn, b1
        csetm   ysgn, cc

// Conditionally negate each 128-bit difference to get its magnitude:
// xor with the mask then subtract the mask (a no-op when the mask is zero)

        eor     a0, a0, sgn
        subs    a0, a0, sgn
        eor     a1, a1, sgn
        sbc     a1, a1, sgn

        eor     b0, b0, ysgn
        subs    b0, b0, ysgn
        eor     b1, b1, ysgn
        sbc     b1, b1, ysgn

// Save the correct sign for the sub-product
// (this must happen before ysgn's register is reused as m1 below)

        eor     sgn, ysgn, sgn

// Add H' = H + L_top, still in [u7,u6,u5,u4]
// After this u2 and u3 are free; u2's register is reused as u below.

        adds    u4, u4, u2
        adcs    u5, u5, u3
        adcs    u6, u6, xzr
        adc     u7, u7, xzr

// Now compute the mid-product as [m3,m2,m1,m0]
// This is a third 2x2 Karatsuba on the magnitudes [a1,a0] and [b1,b0].
// m0 and m3 overwrite the y and x pointer registers, which are not read
// again after this point.

        mul     m0, a0, b0
        umulh   m1, a0, b0
        mul     m2, a1, b1
        umulh   m3, a1, b1

        subs    a1, a1, a0
        cneg    a1, a1, cc
        csetm   u, cc

        adds    m2, m2, m1
        adc     m3, m3, xzr

        subs    b1, b0, b1
        cneg    b1, b1, cc
        cinv    u, u, cc

// [b1,b0] = |a1 - a0| * |b0 - b1|, the inner cross term with sign mask u

        mul     b0, a1, b1
        umulh   b1, a1, b1

        adds    m1, m0, m2
        adcs    m2, m2, m3
        adc     m3, m3, xzr

        adds    xzr, u, #1
        eor     b0, b0, u
        adcs    m1, b0, m1
        eor     b1, b1, u
        adcs    m2, b1, m2
        adc     m3, m3, u

// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2]
// With H' = [u7,u6,u5,u4] on entry, digits 2-3 become [u1,u0] + H'_lo,
// digits 4-5 become H'_hi + H'_lo and digits 6-7 become H'_hi, with the
// carry chained through all six digits. The instruction order matters:
// each of u4, u5 is read as a source before it is overwritten.

        adds    u2, u4, u0
        adcs    u3, u5, u1
        adcs    u4, u6, u4
        adcs    u5, u7, u5
        adcs    u6, u6, xzr
        adc     u7, u7, xzr

// Add in the sign-adjusted complex term
// Same conditional-negate-and-add idiom as before: the carry is preset to
// 1 exactly when sgn is all ones, the mid-product digits are xored with
// sgn, and sgn itself is added into the top two digits as sign extension.

        adds    xzr, sgn, #1
        eor     m0, m0, sgn
        adcs    u2, m0, u2
        eor     m1, m1, sgn
        adcs    u3, m1, u3
        eor     m2, m2, sgn
        adcs    u4, m2, u4
        eor     m3, m3, sgn
        adcs    u5, m3, u5
        adcs    u6, u6, sgn
        adc     u7, u7, sgn

// Now we have the full 8-digit product 2^256 * h + l where
// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0]
// and this is == 38 * h + l (mod p_25519).
// We do the 38 * h + l using 32-bit multiplies avoiding umulh,
// and pre-estimate and feed in the next-level quotient
// q = h + 1 where h = an early version of the high 255 bits.
// We add 2^255 * h - 19 * (h + 1), so end up offset by 2^255.
//
// In terms of the instructions: for each i in 0..3 the pair (u_i, u_{4+i})
// is replaced by two partial sums,
//     u_i     := 38 * lo32(u_{4+i}) + lo32(u_i)
//     u_{4+i} := 38 * hi32(u_{4+i}) + hi32(u_i)
// where the second has weight 2^32 relative to the first. The register h
// is only a temporary here, since the u_i sources are still being read.

        mov     c, #38

        umull   h, u4short, cshort
        add     h, h, u0short, uxtw
        lsr     u0, u0, #32
        lsr     u4, u4, #32
        umaddl  u4, u4short, cshort, u0
        mov     u0, h

        umull   h, u5short, cshort
        add     h, h, u1short, uxtw
        lsr     u1, u1, #32
        lsr     u5, u5, #32
        umaddl  u5, u5short, cshort, u1
        mov     u1, h

        umull   h, u6short, cshort
        add     h, h, u2short, uxtw
        lsr     u2, u2, #32
        lsr     u6, u6, #32
        umaddl  u6, u6short, cshort, u2
        mov     u2, h

        umull   h, u7short, cshort
        add     h, h, u3short, uxtw
        lsr     u3, u3, #32
        lsr     u7, u7, #32
        umaddl  u7, u7short, cshort, u3
        mov     u3, h

// u7 now has weight 2^224, so its bits from bit 31 upward lie at 2^255 and
// above; q takes those bits as the quotient estimate.

        lsr     q, u7, #31

// l = 19 * q + 19 = 19 * (q + 1), added into the lowest partial sum. Both
// u0 and l are far below 2^64 at this point so a plain add is used.

        mov     l, #19
        umaddl  l, lshort, qshort, l
        add     u0, u0, l

// Recombine the partial sums: each extr yields the 64 bits of the
// weight-2^32 terms that line up with the next digit. For the top digit,
// bit 63 of u3 is first toggled by the low bit of q; since c carries that
// same bit in its bit 63, the two cancel modulo 2^64, and the bits of u7
// from bit 32 upward are discarded by the extr. Taken together this
// removes 2^255 * q from the sum.

        adds    u0, u0, u4, lsl #32
        extr    c, u5, u4, #32
        adcs    u1, u1, c
        extr    c, u6, u5, #32
        adcs    u2, u2, c
        extr    c, u7, u6, #32
        lsl     l, q, #63
        eor     u3, u3, l
        adc     u3, u3, c

// Now we correct by a final 2^255-19 if the top bit is clear
// meaning that the "real" pre-reduced result is negative.
// Concretely: 19 is subtracted only when bit 63 of u3 is clear (pl after
// the tst), and then bit 63 is masked off unconditionally. Together with
// the 2^255 offset this is equivalent to adding back p_25519 in the
// negative case and removing the offset in the non-negative case.

        mov     c, #19
        tst     u3, #0x8000000000000000
        csel    c, c, xzr, pl
        subs    u0, u0, c
        sbcs    u1, u1, xzr
        sbcs    u2, u2, xzr
        sbc     u3, u3, xzr
        and     u3, u3, #~0x8000000000000000

// Write back result
// x0 is the z argument; it has not been modified since entry.

        stp     u0, u1, [x0]
        stp     u2, u3, [x0, #16]

        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_mul_p25519)

// On Linux ELF targets, emit an empty .note.GNU-stack section. By GNU
// toolchain convention this marks the object as not needing an
// executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif