I'm learning about x64 programming and the differences between the Intel C++ compiler and GCC, in particular how each of them optimises the generated instructions.
Questions:
What's the best way to tell the Intel compiler to dump the assembly code (similar to gcc -S)? Right now I debug and disassemble in Visual Studio to see the instructions.
The disassembled Intel-compiled psum1 doesn't obey the convention of passing arguments in registers rdi, rsi, rdx, rcx, r8, r9, as can be seen in the GCC assembler output. What am I missing here?
For some reason, the Intel compiler doesn't optimize away the memory accesses. What settings do I need to change?
//Intel compiler /Ox output p[i] = p[i-1] + a[i]; 000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8] 000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4] 000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0 000000013F79119D addss xmm0,xmm1 000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0 //GCC -O3 output LBB1_3: decq %rdx LBB1_2: addq $4, %rsi addq $4, %rdi addss (%rdi), %xmm0 movss %xmm0, (%rsi) testq %rdx, %rdx jne LBB1_3 LBB1_4:
Original C code
/* psum1: running-sum routine used as the test case for the compiler comparison.
 * a - input array; p - output array; n - loop bound (the loop runs for i = 1 .. n-1).
 * p[0] is assigned a[0] before n is ever examined; every later element is stored as
 * p[i-1] + a[i], i.e. the source reads the previous result back through the p array. */
void psum1( float a[], float p[], long int n ) { long int i; p[0] = a[0]; for (i=1; i<n; i++) { p[i] = p[i-1] + a[i]; }}
Disassembly from Intel C++ Compiler 2013 on Visual Studio 2010:
- Full optimisation /Ox
- Enable Intrinsic function /Oi
- Favor speed /Ot
void psum1( float a[], float p[], long int n ) { long int i;
p[0] = a[0];000000013F791156 movss xmm0,dword ptr [rcx]000000013F79115A mov dword ptr [rdx],eaxfor( i=1; i<n; i++ ) { 000000013F79115C jle psum1+7Ah (13F7911CAh) 000000013F79115E mov eax,1 000000013F791163 lea r10d,[r8-1] 000000013F791167 mov r11d,r10d 000000013F79116A xor r9d,r9d 000000013F79116D shr r11d,1Fh 000000013F791171 lea r8d,[r11+r8-1] 000000013F791176 sar r8d,1 000000013F791179 test r8d,r8d 000000013F79117C jbe psum1+5Eh (13F7911AEh) p[i] = p[i-1] + a[i]; 000000013F79117E lea eax,[r9+r9] for( i=1; i<n; i++ ) { 000000013F791182 inc r9d p[i] = p[i-1] + a[i]; 000000013F791185 movsxd rax,eax for( i=1; i<n; i++ ) { 000000013F791188 cmp r9d,r8d p[i] = p[i-1] + a[i]; 000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8] 000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4] 000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0 000000013F79119D addss xmm0,xmm1 000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0 for( i=1; i<n; i++ ) { 000000013F7911A7 jb psum1+2Eh (13F79117Eh) 000000013F7911A9 lea eax,[r9+r9+1] 000000013F7911AE lea r8d,[rax-1] 000000013F7911B2 cmp r10d,r8d 000000013F7911B5 jbe psum1+7Ah (13F7911CAh) p[i] = p[i-1] + a[i]; 000000013F7911B7 movsxd rax,eax 000000013F7911BA movss xmm0,dword ptr [rdx+rax*4-4] 000000013F7911C0 addss xmm0,dword ptr [rcx+rax*4] 000000013F7911C5 movss dword ptr [rdx+rax*4],xmm0 } } 000000013F7911CA ret 000000013F7911CB nop dword ptr [rax+rax]
GCC Assembly output with full optimisation -O3
.section __TEXT,__text,regular,pure_instructions.globl _psum1.align 4, 0x90_psum1:Leh_func_begin1:pushq %rbp Ltmp0:movq %rsp, %rbp Ltmp1:movss (%rdi), %xmm0 movss %xmm0, (%rsi)cmpq $2, %rdx jl LBB1_4 addq $-2, %rdx jmp LBB1_2.align 4, 0x90LBB1_3:decq %rdxLBB1_2:addq $4, %rsi addq $4, %rdi addss (%rdi), %xmm0 movss %xmm0, (%rsi)testq %rdx, %rdx jne LBB1_3LBB1_4:popq %rbp retLeh_func_end1: