// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Inverse square root modulo p_25519 = 2^255 - 19 // Input x[4]; output function return (Legendre symbol) and z[4] // // extern int64_t bignum_invsqrt_p25519(uint64_t z[static 4],const uint64_t x[static 4]); // // Given a 4-digit input x, returns a modular inverse square root mod p_25519, // i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The // inverse square root z is chosen so that its LSB is even (note that p_25519-z // is another possibility). The function return is the Legendre/Jacobi symbol // (x//p_25519), which indicates whether indeed x has a modular inverse square // root and hence whether the result is meaningful: // // 0: x is divisible by p_25519 so trivially there is no inverse square root // +1: x is coprime to p_25519 and z is indeed an inverse square root // -1: x is coprime to p_25519 but there is no (inverse or direct) square root // // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum_arm.h" S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519) S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_invsqrt_p25519) S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519) .text .balign 4 // Size in bytes of a 64-bit word #define N 8 // Pointer-offset pairs for temporaries on stack #define a sp, #0 #define b sp, #(4*N) #define s sp, #(8*N) #define t sp, #(12*N) // Other temporary variables in register #define res x19 // Total size to reserve on the stack #define NSPACE 16*N // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ movz nn, n0 __LF \ movk nn, n1, lsl #16 __LF \ movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ add x0, dest __LF \ add x1, src1 __LF \ add x2, src2 __LF \ CFI_BL(Lbignum_invsqrt_p25519_mul_p25519) #define nsqr(dest,n,src) \ add x0, dest __LF \ mov x1, n __LF \ add x2, src __LF \ CFI_BL(Lbignum_invsqrt_p25519_nsqr_p25519) S2N_BN_SYMBOL(bignum_invsqrt_p25519): CFI_START // Save registers and make room for temporaries CFI_PUSH2(x19,x30) CFI_DEC_SP(NSPACE) // Save the return pointer for the end so we can overwrite x0 later mov res, x0 // Set up reduced version of the input argument a = x mod p_25519. Then // get the candidate inverse square root s = a^{252-3} ldp x2, x3, [x1] ldp x4, x5, [x1, #16] mov x7, #19 lsr x6, x5, #63 madd x6, x7, x6, x7 adds x2, x2, x6 adcs x3, x3, xzr adcs x4, x4, xzr orr x5, x5, #0x8000000000000000 adcs x5, x5, xzr csel x6, x7, xzr, lo subs x2, x2, x6 sbcs x3, x3, xzr sbcs x4, x4, xzr sbc x5, x5, xzr and x5, x5, #0x7fffffffffffffff stp x2, x3, [a] stp x4, x5, [a+16] // Power 2^2 - 1 = 3 nsqr(t,1,a) mulp(t,t,a) // Power 2^4 - 1 = 15 nsqr(s,2,t) mulp(t,s,t) // Power 2^5 - 1 = 31 nsqr(s,1,t) mulp(b,s,a) // Power 2^10 - 1 nsqr(s,5,b) mulp(t,s,b) // Power 2^20 - 1 nsqr(s,10,t) mulp(t,s,t) // Power 2^25 - 1 nsqr(s,5,t) mulp(b,s,b) // Power 2^50 - 1 nsqr(s,25,b) mulp(t,s,b) // Power 2^100 - 1 nsqr(s,50,t) mulp(t,s,t) // Power 2^125 - 1 nsqr(s,25,t) mulp(b,s,b) // Power 2^250 - 1 nsqr(s,125,b) mulp(b,s,b) // Power 2^252 - 3 nsqr(s,2,b) mulp(s,s,a) // s = a^{2^252-3} is now one candidate inverse square root. // Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) stp x0, x1, [t] stp x2, x3, [t+16] mulp(t,s,t) // Now multiplex between them according to whether a * s^2 = 1 nsqr(b,1,s) mulp(b,a,b) ldp x10, x11, [b] eor x10, x10, #1 ldp x12, x13, [b+16] orr x10, x10, x11 orr x12, x12, x13 orr x10, x10, x12 cmp x10, xzr ldp x10, x11, [s] ldp x14, x15, [t] csel x10, x10, x14, eq csel x11, x11, x15, eq ldp x12, x13, [s+16] ldp x16, x17, [t+16] csel x12, x12, x16, eq csel x13, x13, x17, eq // For definiteness, choose "positive" (LSB=0) inverse square root mov x14, #-19 subs x14, x14, x10 mov x16, #-1 sbcs x15, x16, x11 mov x17, #0x7FFFFFFFFFFFFFFF sbcs x16, x16, x12 sbc x17, x17, x13 tst x10, #1 csel x10, x10, x14, eq csel x11, x11, x15, eq csel x12, x12, x16, eq csel x13, x13, x17, eq mov x2, res stp x10, x11, [x2] stp x12, x13, [x2, #16] // Determine if it is is indeed an inverse square root, also distinguishing // the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to // x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. add x0, b mov x1, #1 CFI_BL(Lbignum_invsqrt_p25519_nsqr_p25519) mulp(b,a,b) ldp x10, x11, [b] eor x14, x10, #1 ldp x12, x13, [b+16] orr x10, x10, x11 orr x14, x14, x11 orr x12, x12, x13 orr x10, x10, x12 orr x14, x14, x12 cmp x14, xzr mov x0, #1 cneg x0, x0, ne cmp x10, xzr csel x0, x0, xzr, ne // Restore stack and registers CFI_INC_SP(NSPACE) CFI_POP2(x19,x30) CFI_RET S2N_BN_SIZE_DIRECTIVE(bignum_invsqrt_p25519) // ************************************************************* // Local z = x * y // ************************************************************* S2N_BN_FUNCTION_TYPE_DIRECTIVE(Lbignum_invsqrt_p25519_mul_p25519) Lbignum_invsqrt_p25519_mul_p25519: CFI_START ldp x3, x4, [x1] ldp x5, x6, [x2] umull x7, w3, w5 lsr x17, x3, #32 umull x15, w17, w5 lsr x16, x5, #32 umull x8, w16, w17 umull x16, w3, w16 adds x7, x7, x15, lsl #32 lsr x15, x15, #32 adc x8, x8, x15 adds x7, x7, x16, lsl #32 lsr x16, x16, #32 adc x8, x8, x16 mul x9, x4, x6 umulh x10, x4, x6 subs x4, x4, x3 cneg x4, x4, lo csetm x16, lo adds x9, x9, x8 adc x10, x10, xzr subs x3, x5, x6 cneg x3, x3, lo cinv x16, x16, lo mul x15, x4, x3 umulh x3, x4, x3 adds x8, x7, x9 adcs x9, x9, x10 adc x10, x10, xzr cmn x16, #1 eor x15, x15, x16 adcs x8, x15, x8 eor x3, x3, x16 adcs x9, x3, x9 adc x10, x10, x16 ldp x3, x4, [x1, #16] ldp x5, x6, [x2, #16] umull x11, w3, w5 lsr x17, x3, #32 umull x15, w17, w5 lsr x16, x5, #32 umull x12, w16, w17 umull x16, w3, w16 adds x11, x11, x15, lsl #32 lsr x15, x15, #32 adc x12, x12, x15 adds x11, x11, x16, lsl #32 lsr x16, x16, #32 adc x12, x12, x16 mul x13, x4, x6 umulh x14, x4, x6 subs x4, x4, x3 cneg x4, x4, lo csetm x16, lo adds x13, x13, x12 adc x14, x14, xzr subs x3, x5, x6 cneg x3, x3, lo cinv x16, x16, lo mul x15, x4, x3 umulh x3, x4, x3 adds x12, x11, x13 adcs x13, x13, x14 adc x14, x14, xzr cmn x16, #1 eor x15, x15, x16 adcs x12, x15, x12 eor x3, x3, x16 adcs x13, x3, x13 adc x14, x14, x16 ldp x3, x4, [x1, #16] ldp x15, x16, [x1] subs x3, x3, x15 sbcs x4, x4, x16 csetm x16, lo ldp x15, x17, [x2] subs x5, x15, x5 sbcs x6, x17, x6 csetm x17, lo eor x3, x3, x16 subs x3, x3, x16 eor x4, x4, x16 sbc x4, x4, x16 eor x5, x5, x17 subs x5, x5, x17 eor x6, x6, x17 sbc x6, x6, x17 eor x16, x17, x16 adds x11, x11, x9 adcs x12, x12, x10 adcs x13, x13, xzr adc x14, x14, xzr mul x2, x3, x5 umulh x17, x3, x5 mul x15, x4, x6 umulh x1, x4, x6 subs x4, x4, x3 cneg x4, x4, lo csetm x9, lo adds x15, x15, x17 adc x1, x1, xzr subs x6, x5, x6 cneg x6, x6, lo cinv x9, x9, lo mul x5, x4, x6 umulh x6, x4, x6 adds x17, x2, x15 adcs x15, x15, x1 adc x1, x1, xzr cmn x9, #1 eor x5, x5, x9 adcs x17, x5, x17 eor x6, x6, x9 adcs x15, x6, x15 adc x1, x1, x9 adds x9, x11, x7 adcs x10, x12, x8 adcs x11, x13, x11 adcs x12, x14, x12 adcs x13, x13, xzr adc x14, x14, xzr cmn x16, #1 eor x2, x2, x16 adcs x9, x2, x9 eor x17, x17, x16 adcs x10, x17, x10 eor x15, x15, x16 adcs x11, x15, x11 eor x1, x1, x16 adcs x12, x1, x12 adcs x13, x13, x16 adc x14, x14, x16 mov x3, #38 umull x4, w11, w3 add x4, x4, w7, uxtw lsr x7, x7, #32 lsr x11, x11, #32 umaddl x11, w11, w3, x7 mov x7, x4 umull x4, w12, w3 add x4, x4, w8, uxtw lsr x8, x8, #32 lsr x12, x12, #32 umaddl x12, w12, w3, x8 mov x8, x4 umull x4, w13, w3 add x4, x4, w9, uxtw lsr x9, x9, #32 lsr x13, x13, #32 umaddl x13, w13, w3, x9 mov x9, x4 umull x4, w14, w3 add x4, x4, w10, uxtw lsr x10, x10, #32 lsr x14, x14, #32 umaddl x14, w14, w3, x10 mov x10, x4 lsr x17, x14, #31 mov x5, #19 umaddl x5, w5, w17, x5 add x7, x7, x5 adds x7, x7, x11, lsl #32 extr x3, x12, x11, #32 adcs x8, x8, x3 extr x3, x13, x12, #32 adcs x9, x9, x3 extr x3, x14, x13, #32 lsl x5, x17, #63 eor x10, x10, x5 adc x10, x10, x3 mov x3, #19 tst x10, #0x8000000000000000 csel x3, x3, xzr, pl subs x7, x7, x3 sbcs x8, x8, xzr sbcs x9, x9, xzr sbc x10, x10, xzr and x10, x10, #0x7fffffffffffffff stp x7, x8, [x0] stp x9, x10, [x0, #16] CFI_RET S2N_BN_SIZE_DIRECTIVE(Lbignum_invsqrt_p25519_mul_p25519) // ************************************************************* // Local z = 2^n * x // ************************************************************* S2N_BN_FUNCTION_TYPE_DIRECTIVE(Lbignum_invsqrt_p25519_nsqr_p25519) Lbignum_invsqrt_p25519_nsqr_p25519: CFI_START // Copy input argument into [x13;x12;x11;x10] ldp x10, x11, [x2] ldp x12, x13, [x2, #16] // Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and // only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 Lbignum_invsqrt_p25519_loop: umull x2, w10, w10 lsr x14, x10, #32 umull x3, w14, w14 umull x14, w10, w14 adds x2, x2, x14, lsl #33 lsr x14, x14, #31 adc x3, x3, x14 umull x4, w11, w11 lsr x14, x11, #32 umull x5, w14, w14 umull x14, w11, w14 mul x15, x10, x11 umulh x16, x10, x11 adds x4, x4, x14, lsl #33 lsr x14, x14, #31 adc x5, x5, x14 adds x15, x15, x15 adcs x16, x16, x16 adc x5, x5, xzr adds x3, x3, x15 adcs x4, x4, x16 adc x5, x5, xzr umull x6, w12, w12 lsr x14, x12, #32 umull x7, w14, w14 umull x14, w12, w14 adds x6, x6, x14, lsl #33 lsr x14, x14, #31 adc x7, x7, x14 umull x8, w13, w13 lsr x14, x13, #32 umull x9, w14, w14 umull x14, w13, w14 mul x15, x12, x13 umulh x16, x12, x13 adds x8, x8, x14, lsl #33 lsr x14, x14, #31 adc x9, x9, x14 adds x15, x15, x15 adcs x16, x16, x16 adc x9, x9, xzr adds x7, x7, x15 adcs x8, x8, x16 adc x9, x9, xzr subs x10, x10, x12 sbcs x11, x11, x13 csetm x16, lo eor x10, x10, x16 subs x10, x10, x16 eor x11, x11, x16 sbc x11, x11, x16 adds x6, x6, x4 adcs x7, x7, x5 adcs x8, x8, xzr adc x9, x9, xzr umull x12, w10, w10 lsr x5, x10, #32 umull x13, w5, w5 umull x5, w10, w5 adds x12, x12, x5, lsl #33 lsr x5, x5, #31 adc x13, x13, x5 umull x15, w11, w11 lsr x5, x11, #32 umull x14, w5, w5 umull x5, w11, w5 mul x4, x10, x11 umulh x16, x10, x11 adds x15, x15, x5, lsl #33 lsr x5, x5, #31 adc x14, x14, x5 adds x4, x4, x4 adcs x16, x16, x16 adc x14, x14, xzr adds x13, x13, x4 adcs x15, x15, x16 adc x14, x14, xzr adds x4, x2, x6 adcs x5, x3, x7 adcs x6, x6, x8 adcs x7, x7, x9 csetm x16, lo subs x4, x4, x12 sbcs x5, x5, x13 sbcs x6, x6, x15 sbcs x7, x7, x14 adcs x8, x8, x16 adc x9, x9, x16 mov x10, #38 umull x12, w6, w10 add x12, x12, w2, uxtw lsr x2, x2, #32 lsr x6, x6, #32 umaddl x6, w6, w10, x2 mov x2, x12 umull x12, w7, w10 add x12, x12, w3, uxtw lsr x3, x3, #32 lsr x7, x7, #32 umaddl x7, w7, w10, x3 mov x3, x12 umull x12, w8, w10 add x12, x12, w4, uxtw lsr x4, x4, #32 lsr x8, x8, #32 umaddl x8, w8, w10, x4 mov x4, x12 umull x12, w9, w10 add x12, x12, w5, uxtw lsr x5, x5, #32 lsr x9, x9, #32 umaddl x9, w9, w10, x5 mov x5, x12 lsr x13, x9, #31 mov x11, #19 umull x11, w11, w13 add x2, x2, x11 adds x10, x2, x6, lsl #32 extr x12, x7, x6, #32 adcs x11, x3, x12 extr x12, x8, x7, #32 adcs x12, x4, x12 extr x14, x9, x8, #32 lsl x15, x13, #63 eor x5, x5, x15 adc x13, x5, x14 // Loop as applicable subs x1, x1, #1 bne Lbignum_invsqrt_p25519_loop // We know the intermediate result x < 2^256 - 38, and now we do strict // modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 // which is equivalent to a "pl" condition. adds x6, x10, #19 adcs x7, x11, xzr adcs x8, x12, xzr adcs x9, x13, xzr csel x10, x10, x6, pl csel x11, x11, x7, pl csel x12, x12, x8, pl csel x13, x13, x9, pl bic x13, x13, #0x8000000000000000 // Copy result back into destination and return stp x10, x11, [x0] stp x12, x13, [x0, #16] CFI_RET S2N_BN_SIZE_DIRECTIVE(Lbignum_invsqrt_p25519_nsqr_p25519) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack, "", %progbits #endif