// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Square modulo p_25519, z := (x^2) mod p_25519
// Input x[4]; output z[4]
//
//    extern void bignum_sqr_p25519(uint64_t z[static 4], const uint64_t x[static 4]);
//
// Standard ARM ABI: X0 = z, X1 = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        // Symbol-level directives for bignum_sqr_p25519. These three are
        // macros supplied by the included header; their expansions are not
        // visible in this file.
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_sqr_p25519)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519)
        // Assemble what follows into the code section, 4-byte aligned.
        .text
        .balign 4

// Argument registers, as stated in the file header: X0 = z (output
// pointer), X1 = x (input pointer).

#define z x0
#define x x1

// Variables
//
// u0..u7 are eight 64-bit digit registers (x2..x9). The function body
// uses them first for the double-length square, least significant digit
// in u0, and then for the partially reduced digits.

#define u0 x2
#define u1 x3
#define u2 x4
#define u3 x5
#define u4 x6
#define u5 x7
#define u6 x8
#define u7 x9

// 32-bit (w-register) views of u0..u7, used as operands of the 32x32->64
// multiplies (umull / umaddl) and of the uxtw-extended adds.

#define u0short w2
#define u1short w3
#define u2short w4
#define u3short w5
#define u4short w6
#define u5short w7
#define u6short w8
#define u7short w9

// c, l, h, q (x10..x13) first receive the four input digits x[0..3] and
// are later reused as scratch: the constants 38 and 19, per-digit partial
// sums and the quotient estimate. Each has a 32-bit "short" view too.

#define c x10
#define cshort w10
#define l x11
#define lshort w11
#define h x12
#define hshort w12
#define q x13
#define qshort w13

// t1..t3 (x14..x16) are temporaries for cross products, carries and
// sign/carry masks, again with 32-bit views.

#define t1 x14
#define t1short w14
#define t2 x15
#define t2short w15
#define t3 x16
#define t3short w16

// bignum_sqr_p25519: load the four 64-bit digits at x, build their full
// 8-digit square in registers, fold it down modulo p_25519 = 2^255 - 19
// and store the four result digits at z (x0).
//
// The code below is straight-line (no branches or loops) and works only
// in registers x2..x16 and the condition flags; it makes no stack
// accesses of its own. CFI_START / CFI_RET / S2N_BN_SIZE_DIRECTIVE are
// macros from the included header; no explicit "ret" appears here, so
// CFI_RET is presumably what emits the return -- confirm in the header.

S2N_BN_SYMBOL(bignum_sqr_p25519):
        CFI_START

// First just a near-clone of bignum_sqr_4_8 to get the square, using
// different registers to collect full product without writeback.
//
// Input digits: c = x[0], l = x[1], h = x[2], q = x[3].

        ldp     c, l, [x]
        ldp     h, q, [x, #16]

// [u1,u0] = c^2 from 32-bit halves c = c0 + 2^32 * c1:
// c^2 = c0^2 + 2^33 * c0 * c1 + 2^64 * c1^2. The cross term t1 = c0 * c1
// is split as (t1 << 33) into the low digit and (t1 >> 31) into the high.

        umull   u0, cshort, cshort
        lsr     t1, c, #32
        umull   u1, t1short, t1short
        umull   t1, cshort, t1short
        adds    u0, u0, t1, lsl #33
        lsr     t1, t1, #31
        adc     u1, u1, t1

// [u3,u2] = l^2 in the same way, interleaved with the 128-bit cross
// product [t3,t2] = c * l. That product is doubled (its top carry going
// into u3) and added in at digit 1, giving
// [u3,u2,u1,u0] = (c + 2^64 * l)^2, the square of the low half.

        umull   u2, lshort, lshort
        lsr     t1, l, #32
        umull   u3, t1short, t1short
        umull   t1, lshort, t1short
        mul     t2, c, l
        umulh   t3, c, l
        adds    u2, u2, t1, lsl #33
        lsr     t1, t1, #31
        adc     u3, u3, t1
        adds    t2, t2, t2
        adcs    t3, t3, t3
        adc     u3, u3, xzr
        adds    u1, u1, t2
        adcs    u2, u2, t3
        adc     u3, u3, xzr

// Same again for the high half:
// [u7,u6,u5,u4] = (h + 2^64 * q)^2.

        umull   u4, hshort, hshort
        lsr     t1, h, #32
        umull   u5, t1short, t1short
        umull   t1, hshort, t1short
        adds    u4, u4, t1, lsl #33
        lsr     t1, t1, #31
        adc     u5, u5, t1
        umull   u6, qshort, qshort
        lsr     t1, q, #32
        umull   u7, t1short, t1short
        umull   t1, qshort, t1short
        mul     t2, h, q
        umulh   t3, h, q
        adds    u6, u6, t1, lsl #33
        lsr     t1, t1, #31
        adc     u7, u7, t1
        adds    t2, t2, t2
        adcs    t3, t3, t3
        adc     u7, u7, xzr
        adds    u5, u5, t2
        adcs    u6, u6, t3
        adc     u7, u7, xzr

// [l,c] = |[l,c] - [q,h]|, the absolute difference of the two halves.
// t3 is all ones if the subtraction borrowed (carry clear), else zero;
// xor with t3 followed by subtracting t3 negates exactly when it borrowed.

        subs    c, c, h
        sbcs    l, l, q
        csetm   t3, cc
        eor     c, c, t3
        subs    c, c, t3
        eor     l, l, t3
        sbc     l, l, t3

// Fold the top two digits of the low square into the high square:
// [u7,u6,u5,u4] += [u3,u2]. After this u2 and u3 are free for reuse.

        adds    u4, u4, u2
        adcs    u5, u5, u3
        adcs    u6, u6, xzr
        adc     u7, u7, xzr

// Square the absolute difference, [t1,t2,q,h] = [l,c]^2, with the same
// digit-squaring pattern as above; u3 and u2 serve as temporaries here.

        umull   h, cshort, cshort
        lsr     u3, c, #32
        umull   q, u3short, u3short
        umull   u3, cshort, u3short
        adds    h, h, u3, lsl #33
        lsr     u3, u3, #31
        adc     q, q, u3
        umull   t2, lshort, lshort
        lsr     u3, l, #32
        umull   t1, u3short, u3short
        umull   u3, lshort, u3short
        mul     u2, c, l
        umulh   t3, c, l
        adds    t2, t2, u3, lsl #33
        lsr     u3, u3, #31
        adc     t1, t1, u3
        adds    u2, u2, u2
        adcs    t3, t3, t3
        adc     t1, t1, xzr
        adds    q, q, u2
        adcs    t2, t2, t3
        adc     t1, t1, xzr

// Combine the three squares using 2 * lo * hi = lo^2 + hi^2 - (lo - hi)^2.
// Digits 2..5 become [u1,u0] + [u5,u4] + 2^128 * ([u5,u4] + [u7,u6])
// minus the difference square. The csetm makes t3 zero if the 4-digit
// addition carried out and all ones if it did not; the sbcs chain leaves
// carry set when there was no borrow. Adding t3 plus that carry into
// [u7,u6] therefore applies the net (carry-out minus borrow) to the top.

        adds    u2, u0, u4
        adcs    u3, u1, u5
        adcs    u4, u4, u6
        adcs    u5, u5, u7
        csetm   t3, cc
        subs    u2, u2, h
        sbcs    u3, u3, q
        sbcs    u4, u4, t2
        sbcs    u5, u5, t1
        adcs    u6, u6, t3
        adc     u7, u7, t3

// Now we have the full 8-digit product 2^256 * h + l where
// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0]
// and this is == 38 * h + l (mod p_25519), because 2^256 == 38.
// We do the 38 * h + l using 32-bit multiplies avoiding umulh,
// and pre-estimate and feed in the next-level quotient estimate
// q + 1, where q is read off an early (not yet carry-propagated)
// version of the bits from position 255 upward.
// In total we add 19 * (q + 1) and remove 2^255 * q, which is the same
// as subtracting (q + 1) * p_25519 and adding 2^255, so the value we
// end up with is offset by 2^255 (and computed modulo 2^256).

        mov     c, #38

// For each i in 0..3 the pair (u_i, u_{i+4}) is replaced by a new pair
// with 38 * u_{i+4} + u_i (old values) = u_i + 2^32 * u_{i+4} (new
// values): the low 32-bit halves are combined in h, the high halves in
// u_{i+4}, and neither sum can overflow 64 bits.

        umull   h, u4short, cshort
        add     h, h, u0short, uxtw
        lsr     u0, u0, #32
        lsr     u4, u4, #32
        umaddl  u4, u4short, cshort, u0
        mov     u0, h

        umull   h, u5short, cshort
        add     h, h, u1short, uxtw
        lsr     u1, u1, #32
        lsr     u5, u5, #32
        umaddl  u5, u5short, cshort, u1
        mov     u1, h

        umull   h, u6short, cshort
        add     h, h, u2short, uxtw
        lsr     u2, u2, #32
        lsr     u6, u6, #32
        umaddl  u6, u6short, cshort, u2
        mov     u2, h

        umull   h, u7short, cshort
        add     h, h, u3short, uxtw
        lsr     u3, u3, #32
        lsr     u7, u7, #32
        umaddl  u7, u7short, cshort, u3
        mov     u3, h

// u7 now carries weight 2^224, so its bit 31 is bit 255 of the sum:
// q = u7 >> 31 is the quotient estimate.

        lsr     q, u7, #31

// l = 19 * q + 19 = 19 * (q + 1), added into the bottom digit. Both
// values are far below 2^64 here, so a plain add (no carry) suffices.

        mov     l, #19
        umaddl  l, lshort, qshort, l
        add     u0, u0, l

// Carry-propagating recombination: add the u4..u7 column, shifted left
// by 32 bits across digit boundaries, into [u3,u2,u1,u0]. The extr for
// the top digit discards the bits of u7 above bit 31, and the eor puts
// bit 0 of q into bit 63 of u3 (clear until then), so that it cancels
// against the matching bit 63 of c in the final adc. Together these
// remove q * 2^255 modulo 2^256.

        adds    u0, u0, u4, lsl #32
        extr    c, u5, u4, #32
        adcs    u1, u1, c
        extr    c, u6, u5, #32
        adcs    u2, u2, c
        extr    c, u7, u6, #32
        lsl     l, q, #63
        eor     u3, u3, l
        adc     u3, u3, c

// Now we correct by a final 2^255-19 if the top bit is clear
// meaning that the "real" pre-reduced result is negative.
// The tst sets N from bit 63 of u3, so "pl" selects 19 exactly when
// that bit is clear. Subtracting 19 and then clearing bit 255 is the
// same as adding 2^255 - 19 and dropping the 2^255 offset; when the
// bit is set, only the offset is dropped.

        mov     c, #19
        tst     u3, #0x8000000000000000
        csel    c, c, xzr, pl
        subs    u0, u0, c
        sbcs    u1, u1, xzr
        sbcs    u2, u2, xzr
        sbc     u3, u3, xzr
        and     u3, u3, #~0x8000000000000000

// Write back result: z[0..3] = [u0,u1,u2,u3] (x0 is z)

        stp     u0, u1, [x0]
        stp     u2, u3, [x0, #16]

        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_sqr_p25519)

// On Linux ELF targets only, emit an empty .note.GNU-stack section (no
// flags), the conventional marker that this object does not require an
// executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif