// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
// Inputs x[4], y[4], c[4]; output z[4]
//
//    extern void bignum_madd_n25519(uint64_t z[static 4], const uint64_t x[static 4],
//                                   const uint64_t y[static 4],
//                                   const uint64_t c[static 4]);
//
// Performs z := (x * y + c) mod n_25519, where the modulus is
// n_25519 = 2^252 + 27742317777372353535851937790883648493, the
// order of the curve25519/edwards25519 basepoint. The result z
// and the inputs x, y and c are all 4 digits (256 bits).
//
// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = c
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd_n25519)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_madd_n25519)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd_n25519)
        .text
        .balign 4

// Backup of the result pointer z (passed in x0 per the ABI above), kept in
// a callee-saved register so that x0 itself can be reused as scratch.

#define z x19

// Temporaries for reduction phase
//
//   q       quotient estimate for the current reduction round
//   n0, n1  low and high 64-bit words of n_25519 - 2^252
//   t0-t2   scratch for the product q * [n1;n0] and for the correction
//
// Note that q and n0 alias x2 and x3, the registers in which the y and c
// pointers arrive. They are only written after the last loads through
// those pointers have been performed.

#define q   x2
#define n0  x3
#define n1  x4
#define t0  x5
#define t1  x6
#define t2  x7

// Loading large constants

// movbig(nn,n3,n2,n1,n0) sets the 64-bit register nn to the constant whose
// 16-bit chunks are n3 (bits 63..48), n2, n1 and n0 (bits 15..0): a movz
// for the lowest chunk followed by three movk insertions. Inside this
// macro the parameter names n1 and n0 take precedence over the register
// aliases of the same name, so they denote the immediates passed in.
// NOTE(review): __LF is not defined in this file; presumably it is the
// instruction separator supplied by the included header.

#define movbig(nn,n3,n2,n1,n0)                                      \
        movz    nn, n0 __LF                                            \
        movk    nn, n1, lsl #16 __LF                                   \
        movk    nn, n2, lsl #32 __LF                                   \
        movk    nn, n3, lsl #48

// Single round of modular reduction mod_n25519, mapping
// [m4;m3;m2;m1;m0] = m to [m3;m2;m1;m0] = m mod n_25519,
// *assuming* the input m < 2^64 * n_25519. This is very
// close to the loop body of the bignum_mod_n25519 function.

// Operands and effects of reduce(m4,m3,m2,m1,m0):
//
//   - m4 is only read; the 4-digit result is left in [m3;m2;m1;m0].
//   - n0 and n1 must already hold the low and high words of
//     n_25519 - 2^252; q, t0, t1, t2 and the flags are clobbered.
//
// Outline of the steps:
//
//   1. q = bits 252 and up of m (extr), minus the top 4 bits of m4. When
//      m4 has bit 60 set this caps q below the true quotient, and the same
//      top bits of m4 are added into m3 to compensate, after m3 has been
//      masked to its low 60 bits (i.e. after q * 2^252 is removed).
//   2. [t2;t1;t0] = q * [n1;n0], subtracted from [m3;m2;m1;m0].
//   3. If that subtraction borrowed (cc), n_25519 is added back once:
//      [n1;n0] into the low two digits and 2^60 into m3. The 2^60 is
//      obtained by masking bit 60 out of t0, which is either n0 (whose
//      bit 60 is set for the constant loaded in this file) or zero.

#define reduce(m4,m3,m2,m1,m0)                          \
        extr    q, m4, m3, #60 __LF                        \
        and     m3, m3, #0x0FFFFFFFFFFFFFFF __LF           \
        sub     q, q, m4, lsr #60 __LF                     \
        and     t0, m4, #0xF000000000000000 __LF           \
        add     m3, m3, t0 __LF                            \
        mul     t0, n0, q __LF                             \
        mul     t1, n1, q __LF                             \
        umulh   t2, n0, q __LF                             \
        adds    t1, t1, t2 __LF                            \
        umulh   t2, n1, q __LF                             \
        adc     t2, t2, xzr __LF                           \
        subs    m0, m0, t0 __LF                            \
        sbcs    m1, m1, t1 __LF                            \
        sbcs    m2, m2, t2 __LF                            \
        sbcs    m3, m3, xzr __LF                           \
        csel    t0, n0, xzr, cc __LF                       \
        csel    t1, n1, xzr, cc __LF                       \
        adds    m0, m0, t0 __LF                            \
        and     t2, t0, #0x1000000000000000 __LF           \
        adcs    m1, m1, t1 __LF                            \
        adcs    m2, m2, xzr __LF                           \
        adc     m3, m3, t2

// Special case of "reduce" with m4 = 0. As well as not using m4,
// the quotient selection is slightly simpler, just floor(m/2^252)
// versus min (floor(m/2^252)) (2^64-1).

// Operands and effects of reduce0(m3,m2,m1,m0):
//
//   - The 4-digit input [m3;m2;m1;m0] is reduced in place.
//   - n0 and n1 must already hold the low and high words of
//     n_25519 - 2^252; q, t0, t1, t2 and the flags are clobbered.
//
// Outline: q = m3 >> 60 (the bits of m from 2^252 upward) and m3 is masked
// to its low 60 bits; [t2;t1;t0] = q * [n1;n0] is subtracted; if that
// borrows (cc), n_25519 is added back once, with its 2^252 term formed by
// masking bit 60 out of the selected low word in t0.

#define reduce0(m3,m2,m1,m0)                            \
        lsr     q, m3, #60 __LF                            \
        and     m3, m3, #0x0FFFFFFFFFFFFFFF __LF           \
        mul     t0, n0, q __LF                             \
        mul     t1, n1, q __LF                             \
        umulh   t2, n0, q __LF                             \
        adds    t1, t1, t2 __LF                            \
        umulh   t2, n1, q __LF                             \
        adc     t2, t2, xzr __LF                           \
        subs    m0, m0, t0 __LF                            \
        sbcs    m1, m1, t1 __LF                            \
        sbcs    m2, m2, t2 __LF                            \
        sbcs    m3, m3, xzr __LF                           \
        csel    t0, n0, xzr, cc __LF                       \
        csel    t1, n1, xzr, cc __LF                       \
        adds    m0, m0, t0 __LF                            \
        and     t2, t0, #0x1000000000000000 __LF           \
        adcs    m1, m1, t1 __LF                            \
        adcs    m2, m2, xzr __LF                           \
        adc     m3, m3, t2

// Entry point: z := (x * y + c) mod n_25519, with x0 = z, x1 = x, x2 = y
// and x3 = c on entry. The code is straight-line (no branches) and works
// in three phases: an 8-digit product x * y, the addition of c, and five
// rounds of reduction that consume the product from the top digit down.

S2N_BN_SYMBOL(bignum_madd_n25519):
        CFI_START

// x19 is used below as "z", so it is saved here and restored before return.
// NOTE(review): x20 is not otherwise referenced in this function;
// presumably it is saved only because the macro handles a register pair.

        CFI_PUSH2(x19,x20)

// Back up the result pointer so we can overwrite x0 in intermediate steps

        mov     z, x0

// First compute [x15;x14;x13;x12;x11;x10;x9;x8] = x * y. This is
// a basic 2-level Karatsuba multiplier, similar to the start of
// bignum_mul_p25519, but with changes to the register allocation,
// which in particular preserve x3/w3 for the next step.

// Low half: [x11;x10;x9;x8] = [x[1];x[0]] * [y[1];y[0]].
// Start with [x9;x8] = x[0] * y[0], assembled from four 32x32->64
// partial products (umull) rather than a mul/umulh pair.

        ldp     x0, x4, [x1]
        ldp     x5, x6, [x2]
        umull   x8, w0, w5
        lsr     x17, x0, #32
        umull   x7, w17, w5
        lsr     x16, x5, #32
        umull   x9, w16, w17
        umull   x16, w0, w16
        adds    x8, x8, x7, lsl #32
        lsr     x7, x7, #32
        adc     x9, x9, x7
        adds    x8, x8, x16, lsl #32
        lsr     x16, x16, #32
        adc     x9, x9, x16

// [x11;x10] = x[1] * y[1], then x4 = |x[1] - x[0]| and x0 = |y[0] - y[1]|.
// x16 ends up all-ones exactly when one (not both) of the differences was
// negative, i.e. when the cross term has to be subtracted, else zero.

        mul     x10, x4, x6
        umulh   x11, x4, x6
        subs    x4, x4, x0
        cneg    x4, x4, cc
        csetm   x16, cc
        adds    x10, x10, x9
        adc     x11, x11, xzr
        subs    x0, x5, x6
        cneg    x0, x0, cc
        cinv    x16, x16, cc
        mul     x7, x4, x0
        umulh   x0, x4, x0

// Combine the low and high products with themselves shifted by one digit,
// then add the signed cross term [x0;x7] at digit 1. The cmn sets the
// carry exactly when x16 is all-ones, so eor-with-mask plus that carry-in
// performs a two's complement negation on the fly.

        adds    x9, x8, x10
        adcs    x10, x10, x11
        adc     x11, x11, xzr
        cmn     x16, #0x1
        eor     x7, x7, x16
        adcs    x9, x7, x9
        eor     x0, x0, x16
        adcs    x10, x0, x10
        adc     x11, x11, x16

// High half: [x15;x14;x13;x12] = [x[3];x[2]] * [y[3];y[2]], using exactly
// the same instruction pattern as the low half above.

        ldp     x0, x4, [x1, #16]
        ldp     x5, x6, [x2, #16]
        umull   x12, w0, w5
        lsr     x17, x0, #32
        umull   x7, w17, w5
        lsr     x16, x5, #32
        umull   x13, w16, w17
        umull   x16, w0, w16
        adds    x12, x12, x7, lsl #32
        lsr     x7, x7, #32
        adc     x13, x13, x7
        adds    x12, x12, x16, lsl #32
        lsr     x16, x16, #32
        adc     x13, x13, x16
        mul     x14, x4, x6
        umulh   x15, x4, x6
        subs    x4, x4, x0
        cneg    x4, x4, cc
        csetm   x16, cc
        adds    x14, x14, x13
        adc     x15, x15, xzr
        subs    x0, x5, x6
        cneg    x0, x0, cc
        cinv    x16, x16, cc
        mul     x7, x4, x0
        umulh   x0, x4, x0
        adds    x13, x12, x14
        adcs    x14, x14, x15
        adc     x15, x15, xzr
        cmn     x16, #0x1
        eor     x7, x7, x16
        adcs    x13, x7, x13
        eor     x0, x0, x16
        adcs    x14, x0, x14
        adc     x15, x15, x16

// Top-level cross term. Form the 2-digit differences
// [x4;x0] = [x[3];x[2]] - [x[1];x[0]] and [x6;x5] = [y[1];y[0]] - [y[3];y[2]]
// (x5, x6 still hold y[2], y[3] from the load above), with x16 and x17
// recording the respective borrows as all-ones masks.

        ldp     x0, x4, [x1, #16]
        ldp     x7, x16, [x1]
        subs    x0, x0, x7
        sbcs    x4, x4, x16
        csetm   x16, cc
        ldp     x7, x17, [x2]
        subs    x5, x7, x5
        sbcs    x6, x17, x6
        csetm   x17, cc

// Replace both differences by their absolute values (conditional negation
// via eor and subtract of the mask) and fold the two signs into x16.
// This was the last use of x1 and x2 as pointers; both are scratch below.

        eor     x0, x0, x16
        subs    x0, x0, x16
        eor     x4, x4, x16
        sbc     x4, x4, x16
        eor     x5, x5, x17
        subs    x5, x5, x17
        eor     x6, x6, x17
        sbc     x6, x6, x17
        eor     x16, x17, x16

// Add the top two digits of the low product into the high product.

        adds    x12, x12, x10
        adcs    x13, x13, x11
        adcs    x14, x14, xzr
        adc     x15, x15, xzr

// [x1;x7;x17;x2] = [x4;x0] * [x6;x5], the product of the absolute
// differences, again by one Karatsuba level (mul/umulh this time), with
// x10 serving as the sign mask for the inner cross term.

        mul     x2, x0, x5
        umulh   x17, x0, x5
        mul     x7, x4, x6
        umulh   x1, x4, x6
        subs    x4, x4, x0
        cneg    x4, x4, cc
        csetm   x10, cc
        adds    x7, x7, x17
        adc     x1, x1, xzr
        subs    x6, x5, x6
        cneg    x6, x6, cc
        cinv    x10, x10, cc
        mul     x5, x4, x6
        umulh   x6, x4, x6
        adds    x17, x2, x7
        adcs    x7, x7, x1
        adc     x1, x1, xzr
        cmn     x10, #0x1
        eor     x5, x5, x10
        adcs    x17, x5, x17
        eor     x6, x6, x10
        adcs    x7, x6, x7
        adc     x1, x1, x10

// Add the 6-digit value [x15;x14;x13;x12;x9;x8] to itself shifted up by
// two digits, writing digits 2..7 of the sum into x10..x15. Note that x12
// and x13 are read as sources in the same instructions that overwrite them.

        adds    x10, x12, x8
        adcs    x11, x13, x9
        adcs    x12, x14, x12
        adcs    x13, x15, x13
        adcs    x14, x14, xzr
        adc     x15, x15, xzr

// Add the signed top-level cross term at digit 2, using the same
// cmn/eor/adcs negate-on-the-fly idiom with x16 as the sign mask.

        cmn     x16, #0x1
        eor     x2, x2, x16
        adcs    x10, x2, x10
        eor     x17, x17, x16
        adcs    x11, x17, x11
        eor     x7, x7, x16
        adcs    x12, x7, x12
        eor     x1, x1, x16
        adcs    x13, x1, x13
        adcs    x14, x14, x16
        adc     x15, x15, x16

// Add the constant term, so [x15;x14;x13;x12;x11;x10;x9;x8] = x * y + c
// It's easier to just do this now versus incorporating it into the
// Karatsuba steps above or deferring it until partway through the
// reduction, though it does result in a long carry propagation here.
// These are the last loads through x3, which is redefined as n0 below.

        ldp     x0, x1, [x3]
        adds    x8, x8, x0
        adcs    x9, x9, x1
        ldp     x0, x1, [x3, #16]
        adcs    x10, x10, x0
        adcs    x11, x11, x1
        adcs    x12, x12, xzr
        adcs    x13, x13, xzr
        adcs    x14, x14, xzr
        adc     x15, x15, xzr

// Now do the modular reduction and write back
//
// [n1;n0] = 0x14def9dea2f79cd65812631a5cf5d3ed, the part of n_25519 below
// its 2^252 term. The top four digits are reduced first with reduce0; each
// following reduce round takes the previous 4-digit result as the upper
// digits and brings in one more digit below it, so the final residue is
// left in [x11;x10;x9;x8].

        movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
        movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)

        reduce0(x15,x14,x13,x12)
        reduce(x15,x14,x13,x12,x11)
        reduce(x14,x13,x12,x11,x10)
        reduce(x13,x12,x11,x10,x9)
        reduce(x12,x11,x10,x9,x8)

        stp     x8, x9, [z]
        stp     x10, x11, [z, #16]

// Restore registers and return

        CFI_POP2(x19,x20)
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_madd_n25519)

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif