// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Square root modulo p_25519 = 2^255 - 19
// Input x[4]; output function return (Legendre symbol) and z[4]
//
// extern int64_t bignum_sqrt_p25519_alt(uint64_t z[static 4],
//                                       const uint64_t x[static 4]);
//
// Given a 4-digit input x, returns a modular square root mod p_25519, i.e.
// a z such that z^2 == x (mod p_25519), whenever one exists. The square
// root z is chosen so that its LSB is even (note that p_25519 - z is
// another square root). The function return is the Legendre/Jacobi symbol
// (x//p_25519), which indicates whether indeed x has a modular square root
// and hence whether the result is meaningful:
//
//   0: x is divisible by p_25519 and z is the square root 0
//  +1: x is coprime to p_25519 and z is a square root
//  -1: x is coprime to p_25519 but not a quadratic residue
//
// Standard ARM ABI: X0 = z, X1 = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519_alt)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_sqrt_p25519_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519_alt)


        .text
        .balign 4

// Size in bytes of a 64-bit word

#define N 8

// Pointer-offset pairs for temporaries on stack

#define a sp, #0
#define b sp, #(4*N)
#define s sp, #(8*N)
#define t sp, #(12*N)

// Other temporary variables in register

#define res x19

// Total size to reserve on the stack

#define NSPACE 16*N

// Loading large constants

#define movbig(nn,n3,n2,n1,n0)                                      \
        movz    nn, n0 __LF                                            \
        movk    nn, n1, lsl #16 __LF                                   \
        movk    nn, n2, lsl #32 __LF                                   \
        movk    nn, n3, lsl #48

// Macros wrapping up calls to the local subroutines

#define mulp(dest,src1,src2)                                            \
        add     x0, dest __LF                                              \
        add     x1, src1 __LF                                              \
        add     x2, src2 __LF                                              \
        CFI_BL(Lbignum_sqrt_p25519_alt_mul_p25519)

#define nsqr(dest,n,src)                                                \
        add     x0, dest __LF                                              \
        mov     x1, n __LF                                                 \
        add     x2, src __LF                                               \
        CFI_BL(Lbignum_sqrt_p25519_alt_nsqr_p25519)

S2N_BN_SYMBOL(bignum_sqrt_p25519_alt):
        CFI_START

// Save registers and make room for temporaries

        CFI_PUSH2(x19,x30)
        CFI_DEC_SP(NSPACE)

// Save the return pointer for the end so we can overwrite x0 later

        mov     res, x0

// Set up reduced version of the input argument a = x mod p_25519. Then
// get the candidate square root s = a^{252-2}

        ldp     x2, x3, [x1]
        ldp     x4, x5, [x1, #16]
        mov     x7, #19
        lsr     x6, x5, #63
        madd    x6, x7, x6, x7
        adds    x2, x2, x6
        adcs    x3, x3, xzr
        adcs    x4, x4, xzr
        orr     x5, x5, #0x8000000000000000
        adcs    x5, x5, xzr
        csel    x6, x7, xzr, lo
        subs    x2, x2, x6
        sbcs    x3, x3, xzr
        sbcs    x4, x4, xzr
        sbc     x5, x5, xzr
        and     x5, x5, #0x7fffffffffffffff
        stp     x2, x3, [a]
        stp     x4, x5, [a+16]

  // Power 2^2 - 1 = 3

        nsqr(t,1,a)
        mulp(t,t,a)

  // Power 2^4 - 1 = 15

        nsqr(s,2,t)
        mulp(t,s,t)

  // Power 2^5 - 1 = 31

        nsqr(s,1,t)
        mulp(b,s,a)

  // Power 2^10 - 1

        nsqr(s,5,b)
        mulp(t,s,b)

  // Power 2^20 - 1

        nsqr(s,10,t)
        mulp(t,s,t)

  // Power 2^25 - 1

        nsqr(s,5,t)
        mulp(b,s,b)

  // Power 2^50 - 1

        nsqr(s,25,b)
        mulp(t,s,b)

  // Power 2^100 - 1
        nsqr(s,50,t)
        mulp(t,s,t)

  // Power 2^125 - 1

        nsqr(s,25,t)
        mulp(b,s,b)

  // Power 2^250 - 1

        nsqr(s,125,b)
        mulp(b,s,b)

  // Power 2^251 - 1

        nsqr(s,1,b)
        mulp(t,s,a)

  // Power 2^252 - 2

        nsqr(s,1,t)

// s is now one candidate square root. Generate the other one t = s * j_25519

        movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0)
        movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478)
        movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7)
        movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b)
        stp     x0, x1, [t]
        stp     x2, x3, [t+16]
        mulp(t,s,t)

// Now multiplex between them according to whether s^2 = a

        nsqr(b,1,s)

        ldp     x10, x11, [a]
        ldp     x14, x15, [b]
        eor     x10, x10, x14
        eor     x11, x11, x15
        orr     x10, x10, x11
        ldp     x12, x13, [a+16]
        ldp     x16, x17, [b+16]
        eor     x12, x12, x16
        eor     x13, x13, x17
        orr     x12, x12, x13
        orr     x10, x10, x12
        cmp     x10, xzr

        ldp     x10, x11, [s]
        ldp     x14, x15, [t]
        csel    x10, x10, x14, eq
        csel    x11, x11, x15, eq
        ldp     x12, x13, [s+16]
        ldp     x16, x17, [t+16]
        csel    x12, x12, x16, eq
        csel    x13, x13, x17, eq

// For definiteness, choose "positive" (LSB=0) square root

        mov     x14, #-19
        subs    x14, x14, x10
        mov     x16, #-1
        sbcs    x15, x16, x11
        mov     x17, #0x7FFFFFFFFFFFFFFF
        sbcs    x16, x16, x12
        sbc     x17, x17, x13

        tst     x10, #1
        csel    x10, x10, x14, eq
        csel    x11, x11, x15, eq
        csel    x12, x12, x16, eq
        csel    x13, x13, x17, eq

        mov     x2, res
        stp     x10, x11, [x2]
        stp     x12, x13, [x2, #16]

// Determine if it is is indeed a square root and also if a = 0
// Hence return the Legendre-Jacobi symbol as required.

        add     x0, b
        mov     x1, #1
        CFI_BL(Lbignum_sqrt_p25519_alt_nsqr_p25519)

        ldp     x10, x11, [a]
        ldp     x14, x15, [b]
        eor     x14, x10, x14
        eor     x15, x11, x15
        orr     x14, x14, x15
        ldp     x12, x13, [a+16]
        ldp     x16, x17, [b+16]
        eor     x16, x12, x16
        eor     x17, x13, x17
        orr     x16, x16, x17
        orr     x14, x14, x16
        cmp     x14, xzr
        mov     x0, #1
        cneg    x0, x0, ne

        orr     x10, x10, x11
        orr     x12, x12, x13
        orr     x10, x10, x12
        cmp     x10, xzr
        csel    x0, x0, xzr, ne

// Restore stack and registers

        CFI_INC_SP(NSPACE)
        CFI_POP2(x19,x30)
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_sqrt_p25519_alt)

// *************************************************************
// Local z = x * y
// *************************************************************

S2N_BN_FUNCTION_TYPE_DIRECTIVE(Lbignum_sqrt_p25519_alt_mul_p25519)

Lbignum_sqrt_p25519_alt_mul_p25519:
        CFI_START
        ldp     x3, x4, [x1]
        ldp     x7, x8, [x2]
        mul     x12, x3, x7
        umulh   x13, x3, x7
        mul     x11, x3, x8
        umulh   x14, x3, x8
        adds    x13, x13, x11
        ldp     x9, x10, [x2, #16]
        mul     x11, x3, x9
        umulh   x15, x3, x9
        adcs    x14, x14, x11
        mul     x11, x3, x10
        umulh   x16, x3, x10
        adcs    x15, x15, x11
        adc     x16, x16, xzr
        ldp     x5, x6, [x1, #16]
        mul     x11, x4, x7
        adds    x13, x13, x11
        mul     x11, x4, x8
        adcs    x14, x14, x11
        mul     x11, x4, x9
        adcs    x15, x15, x11
        mul     x11, x4, x10
        adcs    x16, x16, x11
        umulh   x3, x4, x10
        adc     x3, x3, xzr
        umulh   x11, x4, x7
        adds    x14, x14, x11
        umulh   x11, x4, x8
        adcs    x15, x15, x11
        umulh   x11, x4, x9
        adcs    x16, x16, x11
        adc     x3, x3, xzr
        mul     x11, x5, x7
        adds    x14, x14, x11
        mul     x11, x5, x8
        adcs    x15, x15, x11
        mul     x11, x5, x9
        adcs    x16, x16, x11
        mul     x11, x5, x10
        adcs    x3, x3, x11
        umulh   x4, x5, x10
        adc     x4, x4, xzr
        umulh   x11, x5, x7
        adds    x15, x15, x11
        umulh   x11, x5, x8
        adcs    x16, x16, x11
        umulh   x11, x5, x9
        adcs    x3, x3, x11
        adc     x4, x4, xzr
        mul     x11, x6, x7
        adds    x15, x15, x11
        mul     x11, x6, x8
        adcs    x16, x16, x11
        mul     x11, x6, x9
        adcs    x3, x3, x11
        mul     x11, x6, x10
        adcs    x4, x4, x11
        umulh   x5, x6, x10
        adc     x5, x5, xzr
        umulh   x11, x6, x7
        adds    x16, x16, x11
        umulh   x11, x6, x8
        adcs    x3, x3, x11
        umulh   x11, x6, x9
        adcs    x4, x4, x11
        adc     x5, x5, xzr
        mov     x7, #38
        mul     x11, x7, x16
        umulh   x9, x7, x16
        adds    x12, x12, x11
        mul     x11, x7, x3
        umulh   x3, x7, x3
        adcs    x13, x13, x11
        mul     x11, x7, x4
        umulh   x4, x7, x4
        adcs    x14, x14, x11
        mul     x11, x7, x5
        umulh   x5, x7, x5
        adcs    x15, x15, x11
        cset    x16, hs
        adds    x15, x15, x4
        adc     x16, x16, x5
        cmn     x15, x15
        orr     x15, x15, #0x8000000000000000
        adc     x8, x16, x16
        mov     x7, #19
        madd    x11, x7, x8, x7
        adds    x12, x12, x11
        adcs    x13, x13, x9
        adcs    x14, x14, x3
        adcs    x15, x15, xzr
        csel    x7, x7, xzr, lo
        subs    x12, x12, x7
        sbcs    x13, x13, xzr
        sbcs    x14, x14, xzr
        sbc     x15, x15, xzr
        and     x15, x15, #0x7fffffffffffffff
        stp     x12, x13, [x0]
        stp     x14, x15, [x0, #16]
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(Lbignum_sqrt_p25519_alt_mul_p25519)

// *************************************************************
// Local z = 2^n * x
// *************************************************************

S2N_BN_FUNCTION_TYPE_DIRECTIVE(Lbignum_sqrt_p25519_alt_nsqr_p25519)

Lbignum_sqrt_p25519_alt_nsqr_p25519:
        CFI_START

// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20

        ldp     x6, x3, [x2]
        ldp     x4, x5, [x2, #16]
        mov     x2, x6

// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and
// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38

Lbignum_sqrt_p25519_alt_loop:
        mul     x9, x2, x3
        umulh   x10, x2, x3
        mul     x11, x2, x5
        umulh   x12, x2, x5
        mul     x7, x2, x4
        umulh   x6, x2, x4
        adds    x10, x10, x7
        adcs    x11, x11, x6
        mul     x7, x3, x4
        umulh   x6, x3, x4
        adc     x6, x6, xzr
        adds    x11, x11, x7
        mul     x13, x4, x5
        umulh   x14, x4, x5
        adcs    x12, x12, x6
        mul     x7, x3, x5
        umulh   x6, x3, x5
        adc     x6, x6, xzr
        adds    x12, x12, x7
        adcs    x13, x13, x6
        adc     x14, x14, xzr
        adds    x9, x9, x9
        adcs    x10, x10, x10
        adcs    x11, x11, x11
        adcs    x12, x12, x12
        adcs    x13, x13, x13
        adcs    x14, x14, x14
        cset    x6, hs
        umulh   x7, x2, x2
        mul     x8, x2, x2
        adds    x9, x9, x7
        mul     x7, x3, x3
        adcs    x10, x10, x7
        umulh   x7, x3, x3
        adcs    x11, x11, x7
        mul     x7, x4, x4
        adcs    x12, x12, x7
        umulh   x7, x4, x4
        adcs    x13, x13, x7
        mul     x7, x5, x5
        adcs    x14, x14, x7
        umulh   x7, x5, x5
        adc     x6, x6, x7
        mov     x3, #38
        mul     x7, x3, x12
        umulh   x4, x3, x12
        adds    x8, x8, x7
        mul     x7, x3, x13
        umulh   x13, x3, x13
        adcs    x9, x9, x7
        mul     x7, x3, x14
        umulh   x14, x3, x14
        adcs    x10, x10, x7
        mul     x7, x3, x6
        umulh   x6, x3, x6
        adcs    x11, x11, x7
        cset    x12, hs
        adds    x11, x11, x14
        adc     x12, x12, x6
        cmn     x11, x11
        bic     x11, x11, #0x8000000000000000
        adc     x2, x12, x12
        mov     x3, #0x13
        mul     x7, x3, x2
        adds    x2, x8, x7
        adcs    x3, x9, x4
        adcs    x4, x10, x13
        adc     x5, x11, xzr

// Loop as applicable

        subs    x1, x1, #1
        bne     Lbignum_sqrt_p25519_alt_loop

// We know the intermediate result x < 2^256 - 38, and now we do strict
// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255
// which is equivalent to a "pl" condition.

        adds    x6, x2, #19
        adcs    x7, x3, xzr
        adcs    x8, x4, xzr
        adcs    x9, x5, xzr

        csel    x2, x2, x6, pl
        csel    x3, x3, x7, pl
        csel    x4, x4, x8, pl
        csel    x5, x5, x9, pl
        bic     x5, x5, #0x8000000000000000

// Copy result back into destination and return

        stp     x2, x3, [x0]
        stp     x4, x5, [x0, #16]
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(Lbignum_sqrt_p25519_alt_nsqr_p25519)

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif