// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // // extern uint64_t edwards25519_decode_alt(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as // 2^255 * x_0 + y where x_0 is the least significant bit of x. It // returns the full pair of coordinates x (at z) and y (at z+4). The // return code is 0 for success and 1 for failure, which means that // the input does not correspond to the encoding of any edwards25519 // point. This can happen for three reasons, where y = the lowest // 255 bits of the input: // // * y >= p_25519 // Input y coordinate is not reduced // * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root // There is no x such that (x,y) is on the curve // * y^2 = 1 and top bit of input is set // Cannot be the canonical encoding of (0,1) or (0,-1) // // Standard ARM ABI: X0 = z, X1 = c // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum_arm.h" S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode_alt) S2N_BN_FUNCTION_TYPE_DIRECTIVE(edwards25519_decode_alt) S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode_alt) .text .balign 4 // Size in bytes of a 64-bit word #define N 8 // Pointer-offset pairs for temporaries on stack #define y sp, #0 #define s sp, #(4*N) #define t sp, #(8*N) #define u sp, #(12*N) #define v sp, #(16*N) #define w sp, #(20*N) // Other temporary variables in register #define res x19 #define sgnbit x20 #define badun x21 // Total size to reserve on the stack #define NSPACE 24*N // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ movz nn, n0 __LF \ movk nn, n1, lsl #16 __LF \ movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ add x0, dest __LF \ add x1, src1 __LF \ add x2, src2 __LF \ CFI_BL(Ledwards25519_decode_alt_mul_p25519) #define nsqr(dest,n,src) \ add x0, dest __LF \ mov x1, n __LF \ add x2, src __LF \ CFI_BL(Ledwards25519_decode_alt_nsqr_p25519) S2N_BN_SYMBOL(edwards25519_decode_alt): CFI_START // Save registers and make room for temporaries CFI_PUSH2(x19,x20) CFI_PUSH2(x21,x30) CFI_DEC_SP(NSPACE) // Save the return pointer for the end so we can overwrite x0 later mov res, x0 // Load the inputs, using byte operations in case of big-endian setting. // Let y be the lowest 255 bits of the input and sgnbit the desired parity. // If y >= p_25519 then already flag the input as invalid (badun = 1). ldrb w0, [x1] lsl x4, x0, #56 ldrb w0, [x1, #1] extr x4, x0, x4, #8 ldrb w0, [x1, #2] extr x4, x0, x4, #8 ldrb w0, [x1, #3] extr x4, x0, x4, #8 ldrb w0, [x1, #4] extr x4, x0, x4, #8 ldrb w0, [x1, #5] extr x4, x0, x4, #8 ldrb w0, [x1, #6] extr x4, x0, x4, #8 ldrb w0, [x1, #7] extr x4, x0, x4, #8 ldrb w0, [x1, #8] lsl x5, x0, #56 ldrb w0, [x1, #9] extr x5, x0, x5, #8 ldrb w0, [x1, #10] extr x5, x0, x5, #8 ldrb w0, [x1, #11] extr x5, x0, x5, #8 ldrb w0, [x1, #12] extr x5, x0, x5, #8 ldrb w0, [x1, #13] extr x5, x0, x5, #8 ldrb w0, [x1, #14] extr x5, x0, x5, #8 ldrb w0, [x1, #15] extr x5, x0, x5, #8 ldrb w0, [x1, #16] lsl x6, x0, #56 ldrb w0, [x1, #17] extr x6, x0, x6, #8 ldrb w0, [x1, #18] extr x6, x0, x6, #8 ldrb w0, [x1, #19] extr x6, x0, x6, #8 ldrb w0, [x1, #20] extr x6, x0, x6, #8 ldrb w0, [x1, #21] extr x6, x0, x6, #8 ldrb w0, [x1, #22] extr x6, x0, x6, #8 ldrb w0, [x1, #23] extr x6, x0, x6, #8 ldrb w0, [x1, #24] lsl x7, x0, #56 ldrb w0, [x1, #25] extr x7, x0, x7, #8 ldrb w0, [x1, #26] extr x7, x0, x7, #8 ldrb w0, [x1, #27] extr x7, x0, x7, #8 ldrb w0, [x1, #28] extr x7, x0, x7, #8 ldrb w0, [x1, #29] extr x7, x0, x7, #8 ldrb w0, [x1, #30] extr x7, x0, x7, #8 ldrb w0, [x1, #31] extr x7, x0, x7, #8 stp x4, x5, [y] lsr sgnbit, x7, #63 and x7, x7, #0x7FFFFFFFFFFFFFFF stp x6, x7, [y+16] adds xzr, x4, #19 adcs xzr, x5, xzr adcs xzr, x6, xzr adcs xzr, x7, xzr cset badun, mi // u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) // v = 1 + d * y^2 (not reduced modulo from the +1) // w = u * v nsqr(v,1,y) ldp x0, x1, [v] ldp x2, x3, [v+16] mov x4, #0x8000000000000000 subs x0, x0, #20 sbcs x1, x1, xzr sbcs x2, x2, xzr sbc x3, x3, x4 stp x0, x1, [u] stp x2, x3, [u+16] movbig(x0,#0x75eb,#0x4dca,#0x1359,#0x78a3) movbig(x1,#0x0070,#0x0a4d,#0x4141,#0xd8ab) movbig(x2,#0x8cc7,#0x4079,#0x7779,#0xe898) movbig(x3,#0x5203,#0x6cee,#0x2b6f,#0xfe73) stp x0, x1, [w] stp x2, x3, [w+16] mulp(v,w,v) ldp x0, x1, [v] ldp x2, x3, [v+16] adds x0, x0, #1 adcs x1, x1, xzr adcs x2, x2, xzr adcs x3, x3, xzr stp x0, x1, [v] stp x2, x3, [v+16] mulp(w,u,v) // Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). // This power tower computation is the same as bignum_invsqrt_p25519 nsqr(t,1,w) mulp(t,t,w) nsqr(s,2,t) mulp(t,s,t) nsqr(s,1,t) mulp(v,s,w) nsqr(s,5,v) mulp(t,s,v) nsqr(s,10,t) mulp(t,s,t) nsqr(s,5,t) mulp(v,s,v) nsqr(s,25,v) mulp(t,s,v) nsqr(s,50,t) mulp(t,s,t) nsqr(s,25,t) mulp(v,s,v) nsqr(s,125,v) mulp(v,s,v) nsqr(s,2,v) mulp(s,s,w) // Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) // exists, in which case we should get 0, 1 or -1. nsqr(v,1,s) mulp(v,v,w) // Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} // and the other being t = s * j_25519 where j_25519 = sqrt(-1). mulp(s,u,s) movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) stp x0, x1, [t] stp x2, x3, [t+16] mulp(t,s,t) // x4 = 0 <=> s^2 * w = 0 or 1 ldp x0, x1, [v] ldp x2, x3, [v+16] bic x4, x0, #1 orr x4, x4, x1 orr x5, x2, x3 orr x4, x4, x5 // x0 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) add x0, x0, #20 add x1, x1, #1 orr x0, x0, x1 add x2, x2, #1 eor x3, x3, #0x7FFFFFFFFFFFFFFF orr x2, x2, x3 orr x0, x0, x2 // If s^2 * w is not 0 or 1 then replace s by t cmp x4, xzr ldp x10, x11, [s] ldp x14, x15, [t] csel x10, x10, x14, eq csel x11, x11, x15, eq ldp x12, x13, [s+16] ldp x16, x17, [t+16] csel x12, x12, x16, eq csel x13, x13, x17, eq stp x10, x11, [s] stp x12, x13, [s+16] // Check invalidity, occurring if s^2 * w is not in {0,1,-1} ccmp x0, xzr, 4, ne cset x0, ne orr badun, badun, x0 // Let [x3;x2;x1;x0] = s and [x7;x6;x5;x4] = p_25519 - s ldp x0, x1, [s] ldp x2, x3, [s+16] mov x4, #-19 subs x4, x4, x0 mov x6, #-1 sbcs x5, x6, x1 sbcs x6, x6, x2 mov x7, #0x7FFFFFFFFFFFFFFF sbc x7, x7, x3 // Decide whether a flip is apparently indicated, s_0 <=> sgnbit // Decide also if s = 0 by OR-ing its digits. Now if a flip is indicated: // - if s = 0 then mark as invalid // - if s <> 0 then indeed flip and x9, x0, #1 eor sgnbit, x9, sgnbit orr x8, x0, x1 orr x9, x2, x3 orr x8, x8, x9 orr x10, badun, sgnbit cmp x8, xzr csel badun, x10, badun, eq ccmp sgnbit, xzr, #4, ne // Actual selection of x as s or -s, copying of y and return of validity csel x0, x0, x4, eq csel x1, x1, x5, eq csel x2, x2, x6, eq csel x3, x3, x7, eq ldp x8, x9, [y] ldp x10, x11, [y+16] stp x0, x1, [res] stp x2, x3, [res, #16] stp x8, x9, [res, #32] stp x10, x11, [res, #48] mov x0, badun // Restore stack and registers CFI_INC_SP(NSPACE) CFI_POP2(x21,x30) CFI_POP2(x19,x20) CFI_RET S2N_BN_SIZE_DIRECTIVE(edwards25519_decode_alt) // ************************************************************* // Local z = x * y // ************************************************************* S2N_BN_FUNCTION_TYPE_DIRECTIVE(Ledwards25519_decode_alt_mul_p25519) Ledwards25519_decode_alt_mul_p25519: CFI_START ldp x3, x4, [x1] ldp x7, x8, [x2] mul x12, x3, x7 umulh x13, x3, x7 mul x11, x3, x8 umulh x14, x3, x8 adds x13, x13, x11 ldp x9, x10, [x2, #16] mul x11, x3, x9 umulh x15, x3, x9 adcs x14, x14, x11 mul x11, x3, x10 umulh x16, x3, x10 adcs x15, x15, x11 adc x16, x16, xzr ldp x5, x6, [x1, #16] mul x11, x4, x7 adds x13, x13, x11 mul x11, x4, x8 adcs x14, x14, x11 mul x11, x4, x9 adcs x15, x15, x11 mul x11, x4, x10 adcs x16, x16, x11 umulh x3, x4, x10 adc x3, x3, xzr umulh x11, x4, x7 adds x14, x14, x11 umulh x11, x4, x8 adcs x15, x15, x11 umulh x11, x4, x9 adcs x16, x16, x11 adc x3, x3, xzr mul x11, x5, x7 adds x14, x14, x11 mul x11, x5, x8 adcs x15, x15, x11 mul x11, x5, x9 adcs x16, x16, x11 mul x11, x5, x10 adcs x3, x3, x11 umulh x4, x5, x10 adc x4, x4, xzr umulh x11, x5, x7 adds x15, x15, x11 umulh x11, x5, x8 adcs x16, x16, x11 umulh x11, x5, x9 adcs x3, x3, x11 adc x4, x4, xzr mul x11, x6, x7 adds x15, x15, x11 mul x11, x6, x8 adcs x16, x16, x11 mul x11, x6, x9 adcs x3, x3, x11 mul x11, x6, x10 adcs x4, x4, x11 umulh x5, x6, x10 adc x5, x5, xzr umulh x11, x6, x7 adds x16, x16, x11 umulh x11, x6, x8 adcs x3, x3, x11 umulh x11, x6, x9 adcs x4, x4, x11 adc x5, x5, xzr mov x7, #38 mul x11, x7, x16 umulh x9, x7, x16 adds x12, x12, x11 mul x11, x7, x3 umulh x3, x7, x3 adcs x13, x13, x11 mul x11, x7, x4 umulh x4, x7, x4 adcs x14, x14, x11 mul x11, x7, x5 umulh x5, x7, x5 adcs x15, x15, x11 cset x16, hs adds x15, x15, x4 adc x16, x16, x5 cmn x15, x15 orr x15, x15, #0x8000000000000000 adc x8, x16, x16 mov x7, #19 madd x11, x7, x8, x7 adds x12, x12, x11 adcs x13, x13, x9 adcs x14, x14, x3 adcs x15, x15, xzr csel x7, x7, xzr, lo subs x12, x12, x7 sbcs x13, x13, xzr sbcs x14, x14, xzr sbc x15, x15, xzr and x15, x15, #0x7fffffffffffffff stp x12, x13, [x0] stp x14, x15, [x0, #16] CFI_RET S2N_BN_SIZE_DIRECTIVE(Ledwards25519_decode_alt_mul_p25519) // ************************************************************* // Local z = 2^n * x // ************************************************************* S2N_BN_FUNCTION_TYPE_DIRECTIVE(Ledwards25519_decode_alt_nsqr_p25519) Ledwards25519_decode_alt_nsqr_p25519: CFI_START // Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 ldp x6, x3, [x2] ldp x4, x5, [x2, #16] mov x2, x6 // Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and // only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 Ledwards25519_decode_alt_loop: mul x9, x2, x3 umulh x10, x2, x3 mul x11, x2, x5 umulh x12, x2, x5 mul x7, x2, x4 umulh x6, x2, x4 adds x10, x10, x7 adcs x11, x11, x6 mul x7, x3, x4 umulh x6, x3, x4 adc x6, x6, xzr adds x11, x11, x7 mul x13, x4, x5 umulh x14, x4, x5 adcs x12, x12, x6 mul x7, x3, x5 umulh x6, x3, x5 adc x6, x6, xzr adds x12, x12, x7 adcs x13, x13, x6 adc x14, x14, xzr adds x9, x9, x9 adcs x10, x10, x10 adcs x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 cset x6, hs umulh x7, x2, x2 mul x8, x2, x2 adds x9, x9, x7 mul x7, x3, x3 adcs x10, x10, x7 umulh x7, x3, x3 adcs x11, x11, x7 mul x7, x4, x4 adcs x12, x12, x7 umulh x7, x4, x4 adcs x13, x13, x7 mul x7, x5, x5 adcs x14, x14, x7 umulh x7, x5, x5 adc x6, x6, x7 mov x3, #38 mul x7, x3, x12 umulh x4, x3, x12 adds x8, x8, x7 mul x7, x3, x13 umulh x13, x3, x13 adcs x9, x9, x7 mul x7, x3, x14 umulh x14, x3, x14 adcs x10, x10, x7 mul x7, x3, x6 umulh x6, x3, x6 adcs x11, x11, x7 cset x12, hs adds x11, x11, x14 adc x12, x12, x6 cmn x11, x11 bic x11, x11, #0x8000000000000000 adc x2, x12, x12 mov x3, #0x13 mul x7, x3, x2 adds x2, x8, x7 adcs x3, x9, x4 adcs x4, x10, x13 adc x5, x11, xzr // Loop as applicable subs x1, x1, #1 bne Ledwards25519_decode_alt_loop // We know the intermediate result x < 2^256 - 38, and now we do strict // modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 // which is equivalent to a "pl" condition. adds x6, x2, #19 adcs x7, x3, xzr adcs x8, x4, xzr adcs x9, x5, xzr csel x2, x2, x6, pl csel x3, x3, x7, pl csel x4, x4, x8, pl csel x5, x5, x9, pl bic x5, x5, #0x8000000000000000 // Copy result back into destination and return stp x2, x3, [x0] stp x4, x5, [x0, #16] CFI_RET S2N_BN_SIZE_DIRECTIVE(Ledwards25519_decode_alt_nsqr_p25519) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack, "", %progbits #endif