Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22040

gcc turning on -O3 causes bug on 128 bit integer [duplicate]

$
0
0

Two versions of a function that multiplies a unsigned 256 bit integer by constant 977. (for cryptography application) Note __int128_t type is used in the 2nd function.

#include <stdint.h>#include <stdio.h>// z,z_carry = uint256(y) * 977static inline void multiply_977(uint32_t *y, uint32_t *z) {  const uint32_t x = 977;  uint32_t high=0;  uint64_t prod = x*(uint64_t)(y[0]); z[0] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[1])); z[1] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[2])); z[2] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[3])); z[3] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[4])); z[4] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[5])); z[5] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[6])); z[6] = prod;  high=prod>>32; prod = high + (x*(uint64_t)(y[7])); z[7] = prod;  high=prod>>32; z[8] = (uint32_t)high; z[9] = 0;}static inline void multiply_977_2(uint32_t *y_, uint32_t *z_) {  const uint32_t x = 977;  uint64_t *y = (uint64_t*)y_;  uint64_t *z = (uint64_t*)z_;  uint64_t high=0;  __uint128_t prod = x * (__uint128_t)(y[0]);  z[0] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[1]));  z[1] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[2]));  z[2] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[3]));  z[3] = prod; high=prod>>64;  z[4] = high&0x00000000ffffffff;  // z_[8] = (uint64_t)high; z_[9] = 0;}int main(int argc, char** argv) {  uint32_t a[10] = {0};  uint32_t b[10] = {0};  a[0] = 1;  b[0] = 1;  for (int i=0; i<100; ++i) {    multiply_977(a, a);    multiply_977_2(b, b);  }  for (int i=0; i<8; ++i) { printf("%08x ", a[i]); }  printf("\n");  for (int i=0; i<8; ++i) { printf("%08x ", b[i]); }  printf("\n");  return 0;}

output with and without -O3, results are expected to be identical:

khaotik@KKST2:~/tmp$ gcc a.c; ./a.out2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd khaotik@KKST2:~/tmp$ gcc -O3 a.c; ./a.out2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000 

With -O3, gcc generates SSE code:

00000000000010a0 <main>:    10a0:   f3 0f 1e fa             endbr64     10a4:   41 54                   push   %r12    10a6:   66 0f ef c0             pxor   %xmm0,%xmm0    10aa:   b9 64 00 00 00          mov    $0x64,%ecx    10af:   45 31 c9                xor    %r9d,%r9d    10b2:   55                      push   %rbp    10b3:   31 f6                   xor    %esi,%esi    10b5:   31 ed                   xor    %ebp,%ebp    10b7:   45 31 db                xor    %r11d,%r11d    10ba:   53                      push   %rbx    10bb:   45 31 c0                xor    %r8d,%r8d    10be:   31 db                   xor    %ebx,%ebx    10c0:   45 31 d2                xor    %r10d,%r10d    10c3:   bf 01 00 00 00          mov    $0x1,%edi    10c8:   48 83 ec 60             sub    $0x60,%rsp    10cc:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax    10d3:   00 00     10d5:   48 89 44 24 58          mov    %rax,0x58(%rsp)    10da:   31 c0                   xor    %eax,%eax    10dc:   0f 29 44 24 30          movaps %xmm0,0x30(%rsp)    10e1:   c7 44 24 24 00 00 00    movl   $0x0,0x24(%rsp)    10e8:   00     10e9:   48 c7 44 24 50 00 00    movq   $0x0,0x50(%rsp)    10f0:   00 00     10f2:   c7 44 24 30 01 00 00    movl   $0x1,0x30(%rsp)    10f9:   00     10fa:   0f 29 44 24 40          movaps %xmm0,0x40(%rsp)    10ff:   90                      nop    1100:   89 f8                   mov    %edi,%eax    1102:   4d 69 c9 d1 03 00 00    imul   $0x3d1,%r9,%r9    1109:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    1110:   89 c7                   mov    %eax,%edi    1112:   48 c1 e8 20             shr    $0x20,%rax    1116:   48 89 c2                mov    %rax,%rdx    1119:   44 89 d0                mov    %r10d,%eax    111c:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    1123:   48 01 d0                add    %rdx,%rax    1126:   41 89 c2                mov    %eax,%r10d    1129:   48 c1 e8 20             shr    $0x20,%rax    112d:   48 89 c2                mov    %rax,%rdx    1130:   44 89 c0                mov    %r8d,%eax    1133:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    113a:   48 01 d0                add    %rdx,%rax    113d:   41 89 c0                mov    %eax,%r8d    1140:   48 c1 e8 20             shr    $0x20,%rax    1144:   48 89 c2                mov    %rax,%rdx    1147:   44 89 d8                mov    %r11d,%eax    114a:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    1151:   48 01 d0                add    %rdx,%rax    1154:   41 89 c3                mov    %eax,%r11d    1157:   48 c1 e8 20             shr    $0x20,%rax    115b:   48 89 c2                mov    %rax,%rdx    115e:   89 f0                   mov    %esi,%eax    1160:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    1167:   48 01 d0                add    %rdx,%rax    116a:   89 c6                   mov    %eax,%esi    116c:   48 c1 e8 20             shr    $0x20,%rax    1170:   48 89 c2                mov    %rax,%rdx    1173:   89 e8                   mov    %ebp,%eax    1175:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    117c:   48 01 d0                add    %rdx,%rax    117f:   89 c5                   mov    %eax,%ebp    1181:   48 c1 e8 20             shr    $0x20,%rax    1185:   48 89 c2                mov    %rax,%rdx    1188:   89 d8                   mov    %ebx,%eax    118a:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax    1191:   48 01 d0                add    %rdx,%rax    1194:   89 c3                   mov    %eax,%ebx    1196:   48 c1 e8 20             shr    $0x20,%rax    119a:   4c 01 c8                add    %r9,%rax    119d:   41 89 c1                mov    %eax,%r9d    11a0:   83 e9 01                sub    $0x1,%ecx    11a3:   0f 85 57 ff ff ff       jne    1100 <main+0x60>    11a9:   66 41 0f 6e c8          movd   %r8d,%xmm1    11ae:   66 41 0f 6e d3          movd   %r11d,%xmm2    11b3:   66 0f 6e c7             movd   %edi,%xmm0    11b7:   48 c1 e8 20             shr    $0x20,%rax    11bb:   66 41 0f 6e da          movd   %r10d,%xmm3    11c0:   66 0f 62 ca             punpckldq %xmm2,%xmm1    11c4:   66 0f 6e ed             movd   %ebp,%xmm5    11c8:   89 44 24 20             mov    %eax,0x20(%rsp)    11cc:   66 0f 62 c3             punpckldq %xmm3,%xmm0    11d0:   66 41 0f 6e e1          movd   %r9d,%xmm4    11d5:   4c 8d 64 24 20          lea    0x20(%rsp),%r12    11da:   66 0f 6c c1             punpcklqdq %xmm1,%xmm0    11de:   66 0f 6e cb             movd   %ebx,%xmm1    11e2:   48 8d 2d 1b 0e 00 00    lea    0xe1b(%rip),%rbp        # 2004 <_IO_stdin_used+0x4>    11e9:   48 89 e3                mov    %rsp,%rbx    11ec:   0f 29 04 24             movaps %xmm0,(%rsp)    11f0:   66 0f 6e c6             movd   %esi,%xmm0    11f4:   66 0f 62 cc             punpckldq %xmm4,%xmm1    11f8:   66 0f 62 c5             punpckldq %xmm5,%xmm0    11fc:   66 0f 6c c1             punpcklqdq %xmm1,%xmm0    1200:   0f 29 44 24 10          movaps %xmm0,0x10(%rsp)    1205:   0f 1f 00                nopl   (%rax)    1208:   8b 13                   mov    (%rbx),%edx    120a:   48 89 ee                mov    %rbp,%rsi    120d:   bf 01 00 00 00          mov    $0x1,%edi    1212:   31 c0                   xor    %eax,%eax    1214:   48 83 c3 04             add    $0x4,%rbx    1218:   e8 73 fe ff ff          callq  1090 <__printf_chk@plt>    121d:   49 39 dc                cmp    %rbx,%r12    1220:   75 e6                   jne    1208 <main+0x168>    1222:   bf 0a 00 00 00          mov    $0xa,%edi    1227:   48 8d 5c 24 30          lea    0x30(%rsp),%rbx    122c:   4c 8d 64 24 50          lea    0x50(%rsp),%r12    1231:   e8 3a fe ff ff          callq  1070 <putchar@plt>    1236:   48 8d 2d c7 0d 00 00    lea    0xdc7(%rip),%rbp        # 2004 <_IO_stdin_used+0x4>    123d:   0f 1f 00                nopl   (%rax)    1240:   8b 13                   mov    (%rbx),%edx    1242:   48 89 ee                mov    %rbp,%rsi    1245:   bf 01 00 00 00          mov    $0x1,%edi    124a:   31 c0                   xor    %eax,%eax    124c:   48 83 c3 04             add    $0x4,%rbx    1250:   e8 3b fe ff ff          callq  1090 <__printf_chk@plt>    1255:   49 39 dc                cmp    %rbx,%r12    1258:   75 e6                   jne    1240 <main+0x1a0>    125a:   bf 0a 00 00 00          mov    $0xa,%edi    125f:   e8 0c fe ff ff          callq  1070 <putchar@plt>    1264:   48 8b 44 24 58          mov    0x58(%rsp),%rax    1269:   64 48 33 04 25 28 00    xor    %fs:0x28,%rax    1270:   00 00     1272:   75 0b                   jne    127f <main+0x1df>    1274:   48 83 c4 60             add    $0x60,%rsp    1278:   31 c0                   xor    %eax,%eax    127a:   5b                      pop    %rbx    127b:   5d                      pop    %rbp    127c:   41 5c                   pop    %r12    127e:   c3                      retq       127f:   e8 fc fd ff ff          callq  1080 <__stack_chk_fail@plt>    1284:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)    128b:   00 00 00     128e:   66 90                   xchg   %ax,%ax

However I'm not asm-savvy enough to figure out the problem.

compiler version and cpu flags:

khaotik@KKST2:~/tmp$ gcc --versiongcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0Copyright (C) 2019 Free Software Foundation, Inc.This is free software; see the source for copying conditions.  There is NOwarranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.khaotik@KKST2:~/tmp$ lscpu | grep FlagsFlags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d

Viewing all articles
Browse latest Browse all 22040

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>