- The function computes r ≡ a × b (mod p)
- a, b, r, and p are 256-bit integers
- p = 2256 - 0x1000003D1
void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) { uint64_t c = (uint64_t)a[0] * b[0]; uint32_t t0 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[1] + (uint64_t)a[1] * b[0]; uint32_t t1 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[2] + (uint64_t)a[1] * b[1] + (uint64_t)a[2] * b[0]; uint32_t t2 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[3] + (uint64_t)a[1] * b[2] + (uint64_t)a[2] * b[1] + (uint64_t)a[3] * b[0]; uint32_t t3 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[4] + (uint64_t)a[1] * b[3] + (uint64_t)a[2] * b[2] + (uint64_t)a[3] * b[1] + (uint64_t)a[4] * b[0]; uint32_t t4 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[5] + (uint64_t)a[1] * b[4] + (uint64_t)a[2] * b[3] + (uint64_t)a[3] * b[2] + (uint64_t)a[4] * b[1] + (uint64_t)a[5] * b[0]; uint32_t t5 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[6] + (uint64_t)a[1] * b[5] + (uint64_t)a[2] * b[4] + (uint64_t)a[3] * b[3] + (uint64_t)a[4] * b[2] + (uint64_t)a[5] * b[1] + (uint64_t)a[6] * b[0]; uint32_t t6 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[7] + (uint64_t)a[1] * b[6] + (uint64_t)a[2] * b[5] + (uint64_t)a[3] * b[4] + (uint64_t)a[4] * b[3] + (uint64_t)a[5] * b[2] + (uint64_t)a[6] * b[1] + (uint64_t)a[7] * b[0]; uint32_t t7 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[8] + (uint64_t)a[1] * b[7] + (uint64_t)a[2] * b[6] + (uint64_t)a[3] * b[5] + (uint64_t)a[4] * b[4] + (uint64_t)a[5] * b[3] + (uint64_t)a[6] * b[2] + (uint64_t)a[7] * b[1] + (uint64_t)a[8] * b[0]; uint32_t t8 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[0] * b[9] + (uint64_t)a[1] * b[8] + (uint64_t)a[2] * b[7] + (uint64_t)a[3] * b[6] + (uint64_t)a[4] * b[5] + (uint64_t)a[5] * b[4] + (uint64_t)a[6] * b[3] + (uint64_t)a[7] * b[2] + (uint64_t)a[8] * b[1] + (uint64_t)a[9] * b[0]; uint32_t t9 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[1] * b[9] + (uint64_t)a[2] * b[8] + (uint64_t)a[3] * b[7] + (uint64_t)a[4] * b[6] + (uint64_t)a[5] * b[5] + (uint64_t)a[6] * b[4] + (uint64_t)a[7] * b[3] + (uint64_t)a[8] * b[2] + (uint64_t)a[9] * b[1]; uint32_t t10 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[2] * b[9] + (uint64_t)a[3] * b[8] + (uint64_t)a[4] * b[7] + (uint64_t)a[5] * b[6] + (uint64_t)a[6] * b[5] + (uint64_t)a[7] * b[4] + (uint64_t)a[8] * b[3] + (uint64_t)a[9] * b[2]; uint32_t t11 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[3] * b[9] + (uint64_t)a[4] * b[8] + (uint64_t)a[5] * b[7] + (uint64_t)a[6] * b[6] + (uint64_t)a[7] * b[5] + (uint64_t)a[8] * b[4] + (uint64_t)a[9] * b[3]; uint32_t t12 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[4] * b[9] + (uint64_t)a[5] * b[8] + (uint64_t)a[6] * b[7] + (uint64_t)a[7] * b[6] + (uint64_t)a[8] * b[5] + (uint64_t)a[9] * b[4]; uint32_t t13 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[5] * b[9] + (uint64_t)a[6] * b[8] + (uint64_t)a[7] * b[7] + (uint64_t)a[8] * b[6] + (uint64_t)a[9] * b[5]; uint32_t t14 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[6] * b[9] + (uint64_t)a[7] * b[8] + (uint64_t)a[8] * b[7] + (uint64_t)a[9] * b[6]; uint32_t t15 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[7] * b[9] + (uint64_t)a[8] * b[8] + (uint64_t)a[9] * b[7]; uint32_t t16 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[8] * b[9] + (uint64_t)a[9] * b[8]; uint32_t t17 = c & 0x3FFFFFFUL; c = c >> 26; c = c + (uint64_t)a[9] * b[9]; uint32_t t18 = c & 0x3FFFFFFUL; c = c >> 26; uint32_t t19 = c; c = t0 + (uint64_t)t10 * 0x3D10UL; t0 = c & 0x3FFFFFFUL; c = c >> 26; c = c + t1 + (uint64_t)t10*0x400UL + (uint64_t)t11 * 0x3D10UL; t1 = c & 0x3FFFFFFUL; c = c >> 26; c = c + t2 + (uint64_t)t11*0x400UL + (uint64_t)t12 * 0x3D10UL; t2 = c & 0x3FFFFFFUL; c = c >> 26; c = c + t3 + (uint64_t)t12*0x400UL + (uint64_t)t13 * 0x3D10UL; r[3] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t4 + (uint64_t)t13*0x400UL + (uint64_t)t14 * 0x3D10UL; r[4] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t5 + (uint64_t)t14*0x400UL + (uint64_t)t15 * 0x3D10UL; r[5] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t6 + (uint64_t)t15*0x400UL + (uint64_t)t16 * 0x3D10UL; r[6] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t7 + (uint64_t)t16*0x400UL + (uint64_t)t17 * 0x3D10UL; r[7] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t8 + (uint64_t)t17*0x400UL + (uint64_t)t18 * 0x3D10UL; r[8] = c & 0x3FFFFFFUL; c = c >> 26; c = c + t9 + (uint64_t)t18*0x400UL + (uint64_t)t19 * 0x1000003D10UL; r[9] = c & 0x03FFFFFUL; c = c >> 22; uint64_t d = t0 + c * 0x3D1UL; r[0] = d & 0x3FFFFFFUL; d = d >> 26; d = d + t1 + c*0x40; r[1] = d & 0x3FFFFFFUL; d = d >> 26; r[2] = t2 + d; }
Mathematics:
We can represent a and b in this way:a = a0 + a1 × 226 + a2 × 252 + a3 × 278 + … + a7 × 2182 + a8 × 2208 + a9 × 2234
b = b0 + b1 × 226 + b2 × 252 + b3 × 278 + … + b7 × 2182 + b8 × 2208 + b9 × 2234
Where a0..8 and b0..8 are 26-bit integers, a9 and b9 are 22-bit integers
The coefficients of t are:
t0 = a0b0t1 = a0b1 + a1b0 + t0's carry
t2 = a0b2 + a1b1 + a2b0 + t1's carry
t3 = a0b3 + a1b2 + a2b1 + a3b0 + t2's carry
…
t9 = a0b9 + a1b8 + a2b7 + a3b6 + a4b5 + a5b4 + a6b3 + a7b2 + a8b1 + a9b0 + t8's carry
…
t17 = a8b9 + a9b8 + t16's carry
t18 = a9b9 + t17's carry
t19 = t18's carry
Reduced Representation:
p = 2256 - 0x1000003D1∵ 2256 − 0x1000003D1 ≡ 0 (mod p)
∴ 2256 ≡ 0x1000003D1 (mod p)
a×b = t0 + t1 × 226 + t2 × 252 + t3 × 278 + … + t9 × 2234 + … + t17 × 2442 + t18 × 2468 + t19 × 2494
∵ 2256 ≡ 0x1000003D1 (mod p), and by compatibility with scaling property, we can rewrite t10..t19 in this way:
t10 × 2260 = t10 × 24 × 2256 ≡ t10 × 24 × 0x1000003D1 (mod p)
t11 × 2286 = t11 × 230 × 2256 ≡ t11 × 230 × 0x1000003D1 (mod p)
t12 × 2312 = t12 × 256 × 2256 ≡ t12 × 256 × 0x1000003D1 (mod p)
…
t19 × 2494 = t19 × 2238 × 2256 ≡ t19 × 2238 × 0x1000003D1 (mod p)
by compatibility with addition property, we can reduce modulo p as
r0 = t0 + t10 × 24 × 0x1000003D1 (mod p), but r0 only hold lower 26-bits, so we leave 0x100000000 to r1. So:
r0 = t0 + t10 × 24 × 0x3D1 (mod p)
r1 = t1 + t10 × 210 + t11 × 24 × 0x3D1 + r0's carry (mod p)
r2 = t2 + t11 × 210 + t12 × 24 × 0x3D1 + r1's carry (mod p)
r3 = t3 + t12 × 210 + t13 × 24 × 0x3D1 + r2's carry (mod p)
…
r8 = t8 + t17 × 210 + t18 × 24 × 0x3D1 + r7's carry (mod p)
r9 = t9 + t18 × 210 + t19 × 0x1000003D10 + r8's carry (mod p)
2256 ≡ 0x1000003D1 (mod p)
Good reference:
Especially thanks Shen-Ta Hsieh for helping me understand the magics =)
No comments:
Post a Comment