GCC -O0 Generating Weird AVX extra store/reload instructions with intrinsics [duplicate]

I have created a simple, vectorized C function to square each element in an array. The code is as follows:

#include <immintrin.h>void square(const double* arr, uint len, double* outarr) {    __m256d v;    for (uint i = 0; i <= len - 4; i += 4) {        v = _mm256_load_pd(&arr[i]);        _mm256_stream_pd(&outarr[i], _mm256_mul_pd(v, v));    }    for (uint i = len-(len&3u); i < len; i++) {        outarr[i] = arr[i]*arr[i];    }}int main() {    double* inp = aligned_alloc(32, 100* sizeof(double));    double* out = aligned_alloc(32, 100* sizeof(double));    square(inp, 100u, out);    return 0;}

When I compile this code with:

gcc main.c -mavx -o main

I get the following disassembly for the square function:

   0x0000000000400546 <+0>:     lea    0x8(%rsp),%r10
   0x000000000040054b <+5>:     and    $0xffffffffffffffe0,%rsp
   0x000000000040054f <+9>:     pushq  -0x8(%r10)
   0x0000000000400553 <+13>:    push   %rbp
   0x0000000000400554 <+14>:    mov    %rsp,%rbp
   0x0000000000400557 <+17>:    push   %r10
   0x0000000000400559 <+19>:    sub    $0x50,%rsp
   0x000000000040055d <+23>:    mov    %rdi,-0xb8(%rbp)
   0x0000000000400564 <+30>:    mov    %esi,-0xbc(%rbp)
   0x000000000040056a <+36>:    mov    %rdx,-0xc8(%rbp)
   0x0000000000400571 <+43>:    movl   $0x0,-0xa8(%rbp)
   0x000000000040057b <+53>:    jmpq   0x400615 <square+207>
   0x0000000000400580 <+58>:    mov    -0xa8(%rbp),%eax
   0x0000000000400586 <+64>:    cltq
   0x0000000000400588 <+66>:    lea    0x0(,%rax,8),%rdx
   0x0000000000400590 <+74>:    mov    -0xb8(%rbp),%rax
   0x0000000000400597 <+81>:    add    %rdx,%rax
   0x000000000040059a <+84>:    mov    %rax,-0xa0(%rbp)
   0x00000000004005a1 <+91>:    mov    -0xa0(%rbp),%rax
   0x00000000004005a8 <+98>:    vmovapd (%rax),%ymm0
   0x00000000004005ac <+102>:   vmovapd %ymm0,-0x90(%rbp)
   0x00000000004005b4 <+110>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005bc <+118>:   vmovapd %ymm0,-0x70(%rbp)
   0x00000000004005c1 <+123>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005c9 <+131>:   vmovapd %ymm0,-0x30(%rbp)
   0x00000000004005ce <+136>:   vmovapd -0x70(%rbp),%ymm0
   0x00000000004005d3 <+141>:   vmulpd -0x30(%rbp),%ymm0,%ymm0
   0x00000000004005d8 <+146>:   mov    -0xa8(%rbp),%eax
   0x00000000004005de <+152>:   cltq
   0x00000000004005e0 <+154>:   lea    0x0(,%rax,8),%rdx
   0x00000000004005e8 <+162>:   mov    -0xc8(%rbp),%rax
   0x00000000004005ef <+169>:   add    %rdx,%rax
   0x00000000004005f2 <+172>:   mov    %rax,-0x98(%rbp)
   0x00000000004005f9 <+179>:   vmovapd %ymm0,-0x50(%rbp)
   0x00000000004005fe <+184>:   mov    -0x98(%rbp),%rax
   0x0000000000400605 <+191>:   vmovapd -0x50(%rbp),%ymm0
   0x000000000040060a <+196>:   vmovntpd %ymm0,(%rax)
   0x000000000040060e <+200>:   addl   $0x4,-0xa8(%rbp)
   0x0000000000400615 <+207>:   mov    -0xbc(%rbp),%eax
   0x000000000040061b <+213>:   lea    -0x4(%rax),%edx
   0x000000000040061e <+216>:   mov    -0xa8(%rbp),%eax
   0x0000000000400624 <+222>:   cmp    %eax,%edx
   0x0000000000400626 <+224>:   jae    0x400580 <square+58>
   0x000000000040062c <+230>:   mov    -0xbc(%rbp),%eax
   0x0000000000400632 <+236>:   and    $0xfffffffc,%eax
   0x0000000000400635 <+239>:   mov    %eax,-0xa4(%rbp)
   0x000000000040063b <+245>:   jmp    0x40069c <square+342>
   0x000000000040063d <+247>:   mov    -0xa4(%rbp),%eax
   0x0000000000400643 <+253>:   lea    0x0(,%rax,8),%rdx
   0x000000000040064b <+261>:   mov    -0xc8(%rbp),%rax
   0x0000000000400652 <+268>:   add    %rdx,%rax

Then, when I compile the code with:

gcc main.c -mavx -O3 -o main

I get the following disassembly for the square function:

   0x00000000004005c0 <+0>:     lea    -0x4(%rsi),%r9d
   0x00000000004005c4 <+4>:     mov    %rdi,%r8
   0x00000000004005c7 <+7>:     mov    %rdx,%rcx
   0x00000000004005ca <+10>:    xor    %eax,%eax
   0x00000000004005cc <+12>:    nopl   0x0(%rax)
   0x00000000004005d0 <+16>:    vmovapd (%r8),%ymm0
   0x00000000004005d5 <+21>:    add    $0x4,%eax
   0x00000000004005d8 <+24>:    add    $0x20,%r8
   0x00000000004005dc <+28>:    add    $0x20,%rcx
   0x00000000004005e0 <+32>:    vmulpd %ymm0,%ymm0,%ymm0
   0x00000000004005e4 <+36>:    vmovntpd %ymm0,-0x20(%rcx)
   0x00000000004005e9 <+41>:    cmp    %eax,%r9d
   0x00000000004005ec <+44>:    jae    0x4005d0 <square+16>
   0x00000000004005ee <+46>:    mov    %esi,%eax
   0x00000000004005f0 <+48>:    and    $0xfffffffc,%eax
   0x00000000004005f3 <+51>:    cmp    %eax,%esi
   0x00000000004005f5 <+53>:    jbe    0x400617 <square+87>
   0x00000000004005f7 <+55>:    nopw   0x0(%rax,%rax,1)
   0x0000000000400600 <+64>:    mov    %eax,%ecx
   0x0000000000400602 <+66>:    add    $0x1,%eax
   0x0000000000400605 <+69>:    vmovsd (%rdi,%rcx,8),%xmm0
   0x000000000040060a <+74>:    cmp    %eax,%esi
   0x000000000040060c <+76>:    vmulsd %xmm0,%xmm0,%xmm0
   0x0000000000400610 <+80>:    vmovsd %xmm0,(%rdx,%rcx,8)
   0x0000000000400615 <+85>:    jne    0x400600 <square+64>
   0x0000000000400617 <+87>:    vzeroupper
   0x000000000040061a <+90>:    retq

My GCC version is:

gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

I was just wondering if someone could explain what GCC is doing in the first disassembly (the one compiled without an optimization flag). There seem to be a bunch of vmovapd instructions with no particular purpose. For example, the instruction at <+110> seems to be useless, since %ymm0 already holds the contents of -0x90(%rbp). I can understand what the -O3 code does, but I'm confused by the non-optimized code. Thank you.

GCC -O0 Generating Weird AVX extra store/reload instructions with intrinsics [duplicate]

Trending Articles

Practice Sheet of Right form of verbs for HSC Students

Download: FK ft Shenky – Nakuyewa ”Prod by: Shenky”

How to win at Markstrat (Markstrat Tips and Tricks) – Vodites

Ominde Commission Report and Recommendations – Ominde Report of 1964

Bureau of Internal Revenue: Regional Offices (Directory)

GO 53 on Enhancement of Ex-gratia upto 5 Lakhs Toddy Tappers in Telangana

Cakewalk CA-2A Leveling Amplifier v2.0.1.97 WiN, v2.0.1.96 OSX Incl Keygen

Mp3 Download: Mdu - Kunjenjenjena

How the kill the job , when DTP request running for long hours.

Microsoft Intune から展開しているアプリのアップデートについて

18-year-old girl was beaten for half an hour by two Northampton men in 'an...

Car crash in Dunton Bassett leaves driver in critical condition

Macky 2, Two Others In Road Accident

Application log 00000000000000089514: Could not convert queue DLVST90CLNT

Detroit mafia: D’Anna Brothers agree to plea deal

Delivery block field greyed out using VA02

Muloraki Au

【個人撮影】スマホのプライベート映像♪「中に出さないで///」カラオケ屋での生ハメ撮りが流出ｗ【リベンジポルノ】＠PornHub

BREAKING NEWS: Diamond Platnumz Is Reported Dead After Ghastly Car Accident

FIAT 500 B0111 B0112