// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
// Inputs x[4], y[4], c[4]; output z[4]
//
//    extern void bignum_madd_n25519_alt(uint64_t z[static 4],
//                                       const uint64_t x[static 4],
//                                       const uint64_t y[static 4],
//                                       const uint64_t c[static 4]);
//
// Performs z := (x * y + c) mod n_25519, where the modulus is
// n_25519 = 2^252 + 27742317777372353535851937790883648493, the
// order of the curve25519/edwards25519 basepoint. The result z
// and the inputs x, y and c are all 4 digits (256 bits).
//
// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = c
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd_n25519_alt)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_madd_n25519_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd_n25519_alt)
        .text
        .balign 4

// Backup of the result pointer z. The register x0 is reused as a
// temporary during the multiplication (it is loaded with a digit of y),
// so the pointer is kept in x19, which is saved and restored around
// the function body.

#define z x19

// Temporaries for reduction phase. These alias x2..x7, which hold the
// y and c pointers and input digits during the multiply-add phase, so
// they may only be used once x * y + c has been fully accumulated.
//   q       = quotient estimate for one reduction round
//   [n1;n0] = low 128 bits of n_25519, i.e. n_25519 - 2^252
//   t0..t2  = scratch

#define q   x2
#define n0  x3
#define n1  x4
#define t0  x5
#define t1  x6
#define t2  x7

// Loading large constants: set the 64-bit register nn to the value
// made of four 16-bit chunks, listed most significant first, so that
// nn = n3 * 2^48 + n2 * 2^32 + n1 * 2^16 + n0. The movz writes the low
// chunk and zeroes the rest; each movk then inserts one further chunk.
// Within this macro the parameter names n0 and n1 shadow the register
// aliases of the same name and denote 16-bit immediates. Flags are
// not affected.
// NOTE(review): __LF is not defined in this file; it is presumably an
// instruction separator provided by the included header.

#define movbig(nn,n3,n2,n1,n0)                                      \
        movz    nn, n0 __LF                                            \
        movk    nn, n1, lsl #16 __LF                                   \
        movk    nn, n2, lsl #32 __LF                                   \
        movk    nn, n3, lsl #48

// Single round of modular reduction mod_n25519, mapping
// [m4;m3;m2;m1;m0] = m to [m3;m2;m1;m0] = m mod n_25519,
// *assuming* the input m < 2^64 * n_25519. This is very
// close to the loop body of the bignum_mod_n25519 function.
//
// Writing n_25519 = 2^252 + [n1;n0], the steps are:
//
//  - Quotient estimate q = min (floor(m/2^252)) (2^64-1). The extr
//    takes bits 252..315 of m, and subtracting m4 >> 60 (bits 316 and
//    up, at most 1 under the stated bound) caps it at 2^64-1.
//  - [m3;m2;m1;m0] becomes m - q * 2^252: m3 keeps its low 60 bits,
//    and the top 4 bits of m4 are re-added at bit 60 of m3 to cover
//    the capped case.
//  - [t2;t1;t0] = q * [n1;n0] is subtracted, so the 4-digit value is
//    now m - q * n_25519 modulo 2^256.
//  - If that subtraction borrowed (carry clear), n_25519 is added back
//    once. The 2^252 term of n_25519 is obtained by masking bit 60 of
//    the selected t0, which relies on bit 60 of n0 being set.
//
// Clobbers q, t0, t1, t2 and the flags; m4 is read but not written.
// The correction uses csel, so there are no branches.

#define reduce(m4,m3,m2,m1,m0)                          \
        extr    q, m4, m3, #60 __LF                        \
        and     m3, m3, #0x0FFFFFFFFFFFFFFF __LF           \
        sub     q, q, m4, lsr #60 __LF                     \
        and     t0, m4, #0xF000000000000000 __LF           \
        add     m3, m3, t0 __LF                            \
        mul     t0, n0, q __LF                             \
        mul     t1, n1, q __LF                             \
        umulh   t2, n0, q __LF                             \
        adds    t1, t1, t2 __LF                            \
        umulh   t2, n1, q __LF                             \
        adc     t2, t2, xzr __LF                           \
        subs    m0, m0, t0 __LF                            \
        sbcs    m1, m1, t1 __LF                            \
        sbcs    m2, m2, t2 __LF                            \
        sbcs    m3, m3, xzr __LF                           \
        csel    t0, n0, xzr, cc __LF                       \
        csel    t1, n1, xzr, cc __LF                       \
        adds    m0, m0, t0 __LF                            \
        and     t2, t0, #0x1000000000000000 __LF           \
        adcs    m1, m1, t1 __LF                            \
        adcs    m2, m2, xzr __LF                           \
        adc     m3, m3, t2

// Special case of "reduce" with m4 = 0. As well as not using m4,
// the quotient selection is slightly simpler, just floor(m/2^252)
// versus min (floor(m/2^252)) (2^64-1).
//
// Maps the 4-digit [m3;m2;m1;m0] = m to m mod n_25519 in place:
// q = m3 >> 60 (at most 15), m3 is cut down to its low 60 bits to
// give m - q * 2^252, then q * [n1;n0] is subtracted and n_25519 is
// added back once if that subtraction borrowed (carry clear). As in
// "reduce", the 2^252 term of the add-back comes from masking bit 60
// of the selected t0, which relies on bit 60 of n0 being set.
//
// Clobbers q, t0, t1, t2 and the flags.

#define reduce0(m3,m2,m1,m0)                            \
        lsr     q, m3, #60 __LF                            \
        and     m3, m3, #0x0FFFFFFFFFFFFFFF __LF           \
        mul     t0, n0, q __LF                             \
        mul     t1, n1, q __LF                             \
        umulh   t2, n0, q __LF                             \
        adds    t1, t1, t2 __LF                            \
        umulh   t2, n1, q __LF                             \
        adc     t2, t2, xzr __LF                           \
        subs    m0, m0, t0 __LF                            \
        sbcs    m1, m1, t1 __LF                            \
        sbcs    m2, m2, t2 __LF                            \
        sbcs    m3, m3, xzr __LF                           \
        csel    t0, n0, xzr, cc __LF                       \
        csel    t1, n1, xzr, cc __LF                       \
        adds    m0, m0, t0 __LF                            \
        and     t2, t0, #0x1000000000000000 __LF           \
        adcs    m1, m1, t1 __LF                            \
        adcs    m2, m2, xzr __LF                           \
        adc     m3, m3, t2

// Entry point: z := (x * y + c) mod n_25519 with x0 = z, x1 = x,
// x2 = y, x3 = c on entry. The body is straight-line code with no
// branches: a 4x4 schoolbook multiply with c added in, followed by
// five reduction rounds. Registers x0..x16 are used as temporaries,
// and x19 holds z.

S2N_BN_SYMBOL(bignum_madd_n25519_alt):
        CFI_START

// Save x19 (used for z). x20 is not otherwise used in this function;
// NOTE(review): it is presumably pushed as the second half of the pair
// to keep the stack pointer 16-byte aligned.

        CFI_PUSH2(x19,x20)

// Back up the result pointer so we can overwrite x0 in intermediate steps

        mov     z, x0

// First compute [x15;x14;x13;x12;x11;x10;x9;x8] = x * y + c. This
// is a basic schoolbook multiplier similar to the start of
// bignum_mul_p25519_alt except for different registers, but it
// also adds in the c term after the first row accumulation.

// Row 0: [x12;x11;x10;x9;x8] = x[0] * y, where x13 = x[0], x14 = x[1]
// and the digits of y are held as [x5;x4;x0;x7] = [y3;y2;y1;y0].

        ldp     x13, x14, [x1]
        ldp     x7, x0, [x2]
        mul     x8, x13, x7
        umulh   x9, x13, x7
        mul     x16, x13, x0
        umulh   x10, x13, x0
        adds    x9, x9, x16
        ldp     x4, x5, [x2, #16]
        mul     x16, x13, x4
        umulh   x11, x13, x4
        adcs    x10, x10, x16
        mul     x16, x13, x5
        umulh   x12, x13, x5
        adcs    x11, x11, x16
        adc     x12, x12, xzr

// Add c into the low four digits, propagating the carry into x12.
// x15 and x6 are used as temporaries for successive pairs of c digits.

        ldp     x15, x6, [x3]
        adds    x8, x8, x15
        adcs    x9, x9, x6
        ldp     x15, x6, [x3, #16]
        adcs    x10, x10, x15
        adcs    x11, x11, x6
        adc     x12, x12, xzr

// Row 1: add x[1] * y at digit offset 1, low halves of the products
// first and then the high halves, with x13 becoming the new top digit.
// The initial load fetches x15 = x[2] and x6 = x[3] for later rows.

        ldp     x15, x6, [x1, #16]
        mul     x16, x14, x7
        adds    x9, x9, x16
        mul     x16, x14, x0
        adcs    x10, x10, x16
        mul     x16, x14, x4
        adcs    x11, x11, x16
        mul     x16, x14, x5
        adcs    x12, x12, x16
        umulh   x13, x14, x5
        adc     x13, x13, xzr
        umulh   x16, x14, x7
        adds    x10, x10, x16
        umulh   x16, x14, x0
        adcs    x11, x11, x16
        umulh   x16, x14, x4
        adcs    x12, x12, x16
        adc     x13, x13, xzr

// Row 2: add x[2] * y at digit offset 2; x14 (no longer needed as
// x[1]) becomes the new top digit.

        mul     x16, x15, x7
        adds    x10, x10, x16
        mul     x16, x15, x0
        adcs    x11, x11, x16
        mul     x16, x15, x4
        adcs    x12, x12, x16
        mul     x16, x15, x5
        adcs    x13, x13, x16
        umulh   x14, x15, x5
        adc     x14, x14, xzr
        umulh   x16, x15, x7
        adds    x11, x11, x16
        umulh   x16, x15, x0
        adcs    x12, x12, x16
        umulh   x16, x15, x4
        adcs    x13, x13, x16
        adc     x14, x14, xzr

// Row 3: add x[3] * y at digit offset 3; x15 (no longer needed as
// x[2]) becomes the top digit of the 8-digit result.

        mul     x16, x6, x7
        adds    x11, x11, x16
        mul     x16, x6, x0
        adcs    x12, x12, x16
        mul     x16, x6, x4
        adcs    x13, x13, x16
        mul     x16, x6, x5
        adcs    x14, x14, x16
        umulh   x15, x6, x5
        adc     x15, x15, xzr
        umulh   x16, x6, x7
        adds    x12, x12, x16
        umulh   x16, x6, x0
        adcs    x13, x13, x16
        umulh   x16, x6, x4
        adcs    x14, x14, x16
        adc     x15, x15, xzr

// Now do the modular reduction and write back
//
// From here on x2..x7 are reused as q, n0, n1, t0, t1, t2: the input
// pointers and digits they held are no longer needed. [n1;n0] is
// loaded with 0x14def9dea2f79cd65812631a5cf5d3ed = n_25519 - 2^252.
//
// The top four digits are reduced first with reduce0. Each following
// reduce call appends the next lower digit to the previous 4-digit
// remainder r < n_25519, so its input is below 2^64 * n_25519 as the
// macro requires, and leaves the new remainder one digit lower down.
// The final remainder is [x11;x10;x9;x8].

        movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
        movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)

        reduce0(x15,x14,x13,x12)
        reduce(x15,x14,x13,x12,x11)
        reduce(x14,x13,x12,x11,x10)
        reduce(x13,x12,x11,x10,x9)
        reduce(x12,x11,x10,x9,x8)

        stp     x8, x9, [z]
        stp     x10, x11, [z, #16]

// Restore registers and return

        CFI_POP2(x19,x20)
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_madd_n25519_alt)

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif