Two versions of a function that multiplies a unsigned 256 bit integer by constant 977. (for cryptography application) Note __int128_t
type is used in the 2nd function.
#include <stdint.h>#include <stdio.h>// z,z_carry = uint256(y) * 977static inline void multiply_977(uint32_t *y, uint32_t *z) { const uint32_t x = 977; uint32_t high=0; uint64_t prod = x*(uint64_t)(y[0]); z[0] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[1])); z[1] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[2])); z[2] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[3])); z[3] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[4])); z[4] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[5])); z[5] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[6])); z[6] = prod; high=prod>>32; prod = high + (x*(uint64_t)(y[7])); z[7] = prod; high=prod>>32; z[8] = (uint32_t)high; z[9] = 0;}static inline void multiply_977_2(uint32_t *y_, uint32_t *z_) { const uint32_t x = 977; uint64_t *y = (uint64_t*)y_; uint64_t *z = (uint64_t*)z_; uint64_t high=0; __uint128_t prod = x * (__uint128_t)(y[0]); z[0] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[1])); z[1] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[2])); z[2] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[3])); z[3] = prod; high=prod>>64; z[4] = high&0x00000000ffffffff; // z_[8] = (uint64_t)high; z_[9] = 0;}int main(int argc, char** argv) { uint32_t a[10] = {0}; uint32_t b[10] = {0}; a[0] = 1; b[0] = 1; for (int i=0; i<100; ++i) { multiply_977(a, a); multiply_977_2(b, b); } for (int i=0; i<8; ++i) { printf("%08x ", a[i]); } printf("\n"); for (int i=0; i<8; ++i) { printf("%08x ", b[i]); } printf("\n"); return 0;}
output with and without -O3
, results are expected to be identical:
khaotik@KKST2:~/tmp$ gcc a.c; ./a.out2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd khaotik@KKST2:~/tmp$ gcc -O3 a.c; ./a.out2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000
With -O3
, gcc generates SSE code:
00000000000010a0 <main>: 10a0: f3 0f 1e fa endbr64 10a4: 41 54 push %r12 10a6: 66 0f ef c0 pxor %xmm0,%xmm0 10aa: b9 64 00 00 00 mov $0x64,%ecx 10af: 45 31 c9 xor %r9d,%r9d 10b2: 55 push %rbp 10b3: 31 f6 xor %esi,%esi 10b5: 31 ed xor %ebp,%ebp 10b7: 45 31 db xor %r11d,%r11d 10ba: 53 push %rbx 10bb: 45 31 c0 xor %r8d,%r8d 10be: 31 db xor %ebx,%ebx 10c0: 45 31 d2 xor %r10d,%r10d 10c3: bf 01 00 00 00 mov $0x1,%edi 10c8: 48 83 ec 60 sub $0x60,%rsp 10cc: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 10d3: 00 00 10d5: 48 89 44 24 58 mov %rax,0x58(%rsp) 10da: 31 c0 xor %eax,%eax 10dc: 0f 29 44 24 30 movaps %xmm0,0x30(%rsp) 10e1: c7 44 24 24 00 00 00 movl $0x0,0x24(%rsp) 10e8: 00 10e9: 48 c7 44 24 50 00 00 movq $0x0,0x50(%rsp) 10f0: 00 00 10f2: c7 44 24 30 01 00 00 movl $0x1,0x30(%rsp) 10f9: 00 10fa: 0f 29 44 24 40 movaps %xmm0,0x40(%rsp) 10ff: 90 nop 1100: 89 f8 mov %edi,%eax 1102: 4d 69 c9 d1 03 00 00 imul $0x3d1,%r9,%r9 1109: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 1110: 89 c7 mov %eax,%edi 1112: 48 c1 e8 20 shr $0x20,%rax 1116: 48 89 c2 mov %rax,%rdx 1119: 44 89 d0 mov %r10d,%eax 111c: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 1123: 48 01 d0 add %rdx,%rax 1126: 41 89 c2 mov %eax,%r10d 1129: 48 c1 e8 20 shr $0x20,%rax 112d: 48 89 c2 mov %rax,%rdx 1130: 44 89 c0 mov %r8d,%eax 1133: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 113a: 48 01 d0 add %rdx,%rax 113d: 41 89 c0 mov %eax,%r8d 1140: 48 c1 e8 20 shr $0x20,%rax 1144: 48 89 c2 mov %rax,%rdx 1147: 44 89 d8 mov %r11d,%eax 114a: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 1151: 48 01 d0 add %rdx,%rax 1154: 41 89 c3 mov %eax,%r11d 1157: 48 c1 e8 20 shr $0x20,%rax 115b: 48 89 c2 mov %rax,%rdx 115e: 89 f0 mov %esi,%eax 1160: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 1167: 48 01 d0 add %rdx,%rax 116a: 89 c6 mov %eax,%esi 116c: 48 c1 e8 20 shr $0x20,%rax 1170: 48 89 c2 mov %rax,%rdx 1173: 89 e8 mov %ebp,%eax 1175: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 117c: 48 01 d0 add %rdx,%rax 117f: 89 c5 mov %eax,%ebp 1181: 48 c1 e8 20 shr $0x20,%rax 1185: 48 89 c2 mov %rax,%rdx 1188: 89 d8 mov %ebx,%eax 118a: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax 1191: 48 01 d0 add %rdx,%rax 1194: 89 c3 mov %eax,%ebx 1196: 48 c1 e8 20 shr $0x20,%rax 119a: 4c 01 c8 add %r9,%rax 119d: 41 89 c1 mov %eax,%r9d 11a0: 83 e9 01 sub $0x1,%ecx 11a3: 0f 85 57 ff ff ff jne 1100 <main+0x60> 11a9: 66 41 0f 6e c8 movd %r8d,%xmm1 11ae: 66 41 0f 6e d3 movd %r11d,%xmm2 11b3: 66 0f 6e c7 movd %edi,%xmm0 11b7: 48 c1 e8 20 shr $0x20,%rax 11bb: 66 41 0f 6e da movd %r10d,%xmm3 11c0: 66 0f 62 ca punpckldq %xmm2,%xmm1 11c4: 66 0f 6e ed movd %ebp,%xmm5 11c8: 89 44 24 20 mov %eax,0x20(%rsp) 11cc: 66 0f 62 c3 punpckldq %xmm3,%xmm0 11d0: 66 41 0f 6e e1 movd %r9d,%xmm4 11d5: 4c 8d 64 24 20 lea 0x20(%rsp),%r12 11da: 66 0f 6c c1 punpcklqdq %xmm1,%xmm0 11de: 66 0f 6e cb movd %ebx,%xmm1 11e2: 48 8d 2d 1b 0e 00 00 lea 0xe1b(%rip),%rbp # 2004 <_IO_stdin_used+0x4> 11e9: 48 89 e3 mov %rsp,%rbx 11ec: 0f 29 04 24 movaps %xmm0,(%rsp) 11f0: 66 0f 6e c6 movd %esi,%xmm0 11f4: 66 0f 62 cc punpckldq %xmm4,%xmm1 11f8: 66 0f 62 c5 punpckldq %xmm5,%xmm0 11fc: 66 0f 6c c1 punpcklqdq %xmm1,%xmm0 1200: 0f 29 44 24 10 movaps %xmm0,0x10(%rsp) 1205: 0f 1f 00 nopl (%rax) 1208: 8b 13 mov (%rbx),%edx 120a: 48 89 ee mov %rbp,%rsi 120d: bf 01 00 00 00 mov $0x1,%edi 1212: 31 c0 xor %eax,%eax 1214: 48 83 c3 04 add $0x4,%rbx 1218: e8 73 fe ff ff callq 1090 <__printf_chk@plt> 121d: 49 39 dc cmp %rbx,%r12 1220: 75 e6 jne 1208 <main+0x168> 1222: bf 0a 00 00 00 mov $0xa,%edi 1227: 48 8d 5c 24 30 lea 0x30(%rsp),%rbx 122c: 4c 8d 64 24 50 lea 0x50(%rsp),%r12 1231: e8 3a fe ff ff callq 1070 <putchar@plt> 1236: 48 8d 2d c7 0d 00 00 lea 0xdc7(%rip),%rbp # 2004 <_IO_stdin_used+0x4> 123d: 0f 1f 00 nopl (%rax) 1240: 8b 13 mov (%rbx),%edx 1242: 48 89 ee mov %rbp,%rsi 1245: bf 01 00 00 00 mov $0x1,%edi 124a: 31 c0 xor %eax,%eax 124c: 48 83 c3 04 add $0x4,%rbx 1250: e8 3b fe ff ff callq 1090 <__printf_chk@plt> 1255: 49 39 dc cmp %rbx,%r12 1258: 75 e6 jne 1240 <main+0x1a0> 125a: bf 0a 00 00 00 mov $0xa,%edi 125f: e8 0c fe ff ff callq 1070 <putchar@plt> 1264: 48 8b 44 24 58 mov 0x58(%rsp),%rax 1269: 64 48 33 04 25 28 00 xor %fs:0x28,%rax 1270: 00 00 1272: 75 0b jne 127f <main+0x1df> 1274: 48 83 c4 60 add $0x60,%rsp 1278: 31 c0 xor %eax,%eax 127a: 5b pop %rbx 127b: 5d pop %rbp 127c: 41 5c pop %r12 127e: c3 retq 127f: e8 fc fd ff ff callq 1080 <__stack_chk_fail@plt> 1284: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 128b: 00 00 00 128e: 66 90 xchg %ax,%ax
However I'm not asm-savvy enough to figure out the problem.
compiler version and cpu flags:
khaotik@KKST2:~/tmp$ gcc --versiongcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0Copyright (C) 2019 Free Software Foundation, Inc.This is free software; see the source for copying conditions. There is NOwarranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.khaotik@KKST2:~/tmp$ lscpu | grep FlagsFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d