Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22250

Intel Compiler vs GCC code generation differences

$
0
0

I'm learning about x64 programming, and about how the Intel C++ compiler and GCC differ in the way they optimise the instructions they generate.

Questions:

  1. What's the best way to tell the Intel compiler to dump the assembly code (similar to gcc -S)? Right now I debug and disassemble in Visual Studio to see the instructions.

  2. The disassembly of the Intel-compiled psum1 doesn't follow the convention of passing arguments in registers rdi, rsi, rdx, rcx, r8, r9, which is what can be seen in the GCC assembler output. What am I missing here?

  3. For some reason, the Intel compiler doesn't optimize away the memory accesses. What settings do I need to change?

                //intel compiler /Ox output
                p[i] = p[i-1] + a[i];
                000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
                000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
                000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
                000000013F79119D  addss       xmm0,xmm1
                000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0

                //GCC -O3 output
                LBB1_3:
                        decq    %rdx
                LBB1_2:
                        addq    $4, %rsi
                        addq    $4, %rdi
                        addss   (%rdi), %xmm0
                        movss   %xmm0, (%rsi)
                        testq   %rdx, %rdx
                        jne LBB1_3
                LBB1_4:

Original C code

/*
 * psum1 - running (prefix) sum of a float array.
 *
 * Stores a[0] in p[0]; then, for every i in [1, n), stores
 * p[i-1] + a[i] in p[i].
 *
 * a: input values  (a[0] .. a[n-1] are read when n >= 1)
 * p: output values (p[0] .. p[n-1] are written when n >= 1)
 * n: number of elements to process
 *
 * p[0] is assigned before n is examined, so both arrays must hold at
 * least one element even when n <= 1.
 */
void psum1( float a[], float p[], long int n )
{
    long int i;

    /* Seed the running sum with the first input element. */
    p[0] = a[0];

    /* Each result is built from the value stored on the previous pass. */
    for (i=1; i<n; i++) {
        p[i] = p[i-1] + a[i];
    }
}

Disassembly from Intel C++ Compiler 2013 on Visual Studio 2010:

  • Full optimisation /Ox
  • Enable Intrinsic function /Oi
  • Favor speed /Ot

    void psum1( float a[], float p[], long int n ) { long int i;

        p[0] = a[0];
    000000013F791156  movss       xmm0,dword ptr [rcx]
    000000013F79115A  mov         dword ptr [rdx],eax
        for( i=1; i<n; i++ ) {
    000000013F79115C  jle         psum1+7Ah (13F7911CAh)
    000000013F79115E  mov         eax,1
    000000013F791163  lea         r10d,[r8-1]
    000000013F791167  mov         r11d,r10d
    000000013F79116A  xor         r9d,r9d
    000000013F79116D  shr         r11d,1Fh
    000000013F791171  lea         r8d,[r11+r8-1]
    000000013F791176  sar         r8d,1
    000000013F791179  test        r8d,r8d
    000000013F79117C  jbe         psum1+5Eh (13F7911AEh)
            p[i] = p[i-1] + a[i];
    000000013F79117E  lea         eax,[r9+r9]
        for( i=1; i<n; i++ ) {
    000000013F791182  inc         r9d
            p[i] = p[i-1] + a[i];
    000000013F791185  movsxd      rax,eax
        for( i=1; i<n; i++ ) {
    000000013F791188  cmp         r9d,r8d
            p[i] = p[i-1] + a[i];
    000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
    000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
    000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
    000000013F79119D  addss       xmm0,xmm1
    000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0
        for( i=1; i<n; i++ ) {
    000000013F7911A7  jb          psum1+2Eh (13F79117Eh)
    000000013F7911A9  lea         eax,[r9+r9+1]
    000000013F7911AE  lea         r8d,[rax-1]
    000000013F7911B2  cmp         r10d,r8d
    000000013F7911B5  jbe         psum1+7Ah (13F7911CAh)
            p[i] = p[i-1] + a[i];
    000000013F7911B7  movsxd      rax,eax
    000000013F7911BA  movss       xmm0,dword ptr [rdx+rax*4-4]
    000000013F7911C0  addss       xmm0,dword ptr [rcx+rax*4]
    000000013F7911C5  movss       dword ptr [rdx+rax*4],xmm0
        }
    }
    000000013F7911CA  ret
    000000013F7911CB  nop         dword ptr [rax+rax]

GCC Assembly output with full optimisation -O3

/*----------------------------------------------------------------------
 * void psum1(float a[], float p[], long int n)
 *
 * Compiler output (GCC -O3), AT&T syntax, Mach-O section naming.
 * In:    rdi = a, rsi = p, rdx = n
 * Out:   none
 * Clobb: rdi, rsi, rdx, xmm0, flags
 *
 * xmm0 carries the running sum from one iteration to the next, so the
 * previous result is never reloaded from p[].
 *--------------------------------------------------------------------*/
        .section    __TEXT,__text,regular,pure_instructions
        .globl  _psum1
        .align  4, 0x90
_psum1:
Leh_func_begin1:
        pushq   %rbp
Ltmp0:
        movq    %rsp, %rbp
Ltmp1:
        movss   (%rdi), %xmm0           # xmm0 = a[0]
        movss   %xmm0, (%rsi)           # p[0] = a[0]
        cmpq    $2, %rdx
        jl      LBB1_4                  # n < 2 (signed): nothing left to sum
        addq    $-2, %rdx               # rdx = passes remaining after the first
        jmp     LBB1_2                  # enter the loop past the decrement

        .align  4, 0x90
LBB1_3:
        decq    %rdx                    # one more pass consumed
LBB1_2:
        addq    $4, %rsi                # p++ (4-byte elements)
        addq    $4, %rdi                # a++
        addss   (%rdi), %xmm0           # running sum += *a
        movss   %xmm0, (%rsi)           # *p = running sum
        testq   %rdx, %rdx
        jne     LBB1_3                  # repeat while passes remain

LBB1_4:
        popq    %rbp
        ret
Leh_func_end1:

Viewing all articles
Browse latest Browse all 22250

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>