Why is _mm512_store_pd super slow in this matrix multiplication code?

I'm playing with AVX-512 and matrix multiplication, but I must be doing something wrong because I get awful performance when I store my results using _mm512_store_pd.

Here are the relevant snippets of code, first the data structure I'm using and how I initialize it:


/* Matrix of doubles kept in a single buffer, together with its dimensions. */
typedef struct {
        double* values; /* element buffer; the multiplication code addresses line l at values + l * nb_c */
        int nb_l;       /* number of lines */
        int nb_c;       /* number of columns */
} matrix;

/*
 * Allocate an nb_l x nb_c matrix whose values are left uninitialized.
 *
 * The value buffer is requested with a 64-byte alignment (the width of one
 * 512-bit vector). Its byte size is rounded up to a multiple of 64 because
 * aligned_alloc() is only guaranteed to accept sizes that are a multiple of
 * the alignment; an empty matrix still gets a small valid buffer.
 *
 * Returns NULL if a dimension is negative, if the byte size would overflow
 * size_t, or if an allocation fails. The caller owns the result and must
 * free both ->values and the struct itself.
 */
matrix* alloc_matrix(int nb_l, int nb_c){
        const size_t alignment = 64;
        /* Largest element count whose rounded-up byte size still fits in size_t. */
        const size_t max_values = ((size_t)-1 - (alignment - 1)) / sizeof(double);

        if (nb_l < 0 || nb_c < 0){
                return NULL;
        }
        if (nb_c != 0 && (size_t)nb_l > max_values / (size_t)nb_c){
                return NULL;
        }

        size_t nb_bytes = sizeof(double) * (size_t)nb_l * (size_t)nb_c;
        nb_bytes = (nb_bytes + alignment - 1) / alignment * alignment;
        if (nb_bytes == 0){
                nb_bytes = alignment; /* avoid the implementation-defined zero-size request */
        }

        matrix* tmp_matrix = malloc(sizeof *tmp_matrix);
        if (tmp_matrix == NULL){
                return NULL;
        }
        tmp_matrix->values = aligned_alloc(alignment, nb_bytes);
        if (tmp_matrix->values == NULL){
                free(tmp_matrix); /* do not leak the header when the buffer cannot be allocated */
                return NULL;
        }
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}

And here is how I'm trying to multiply two matrices initialized elsewhere in my code:

/*
 * AVX-512 product of A and B, accumulated into a freshly zeroed matrix of
 * A->nb_l lines and B->nb_c columns, which is returned.
 *
 * Every element of A is visited once; it is broadcast and multiplied with one
 * whole line of B, and the products are added to one line of the result.
 *
 * NOTE(review): the inner loop advances 8 doubles at a time with aligned
 * load/store intrinsics, so it appears to assume that B->nb_c is a multiple
 * of 8 and that every line starts on a 64-byte boundary — confirm.
 * NOTE(review): nothing here checks that A->nb_c equals B->nb_l — confirm the
 * callers guarantee it.
 */
matrix* mult_matrix(matrix* A, matrix* B){
        /* avx512 */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // pointer to the start of the current line in res_matrix
        double* B_ptr; // pointer to the start of the current line in B

        __m512d A_broadcast, B_l_8, res_ptr_8;
        // idx_A walks the whole values buffer of A, one element per iteration
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                // broadcast the current value of A to the eight lanes of the vector
                A_broadcast = _mm512_set1_pd(A->values[idx_A]);
                // idx_A / A->nb_c is the line of A, which selects the line of the result
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
                // idx_A % A->nb_c is the column of A, which selects the line of B
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
                // each iteration: two vector loads, one fused multiply-add, one vector store
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
                        B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
                        res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
                        _mm512_store_pd(
                                &res_ptr[offset_B] , 
                                _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8)
                                );
                }
        }
        return res_matrix;

The results are correct, but _mm512_store_pd takes ~90% of the execution time; in fact, this AVX-512 code is barely faster than its non-AVX version.

I've tried everything I could think of, but I can't figure out why this code performs so disappointingly. Do you have any idea?

Thanks.

EDIT 1

Here is the non avx code

        // Scalar version of the same product: the result starts as an
        // A->nb_l x B->nb_c matrix of zeros and is accumulated in place.
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // pointer to the start of the current line in res_matrix
        double* B_ptr; // pointer to the start of the current line in B

        // idx_A walks the whole values buffer of A, one element per iteration
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                // line of A (idx_A / A->nb_c) selects the line of the result
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c; 
                // column of A (idx_A % A->nb_c) selects the line of B
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c; 
                // add the current value of A times the line of B to the result line
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){                    
                        res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
                }
        }
        return res_matrix;

All matrices are 512x512 random matrices, each multiplication is repeated 50 times and the running time is averaged.

Finally, the snippet below should be enough to test the AVX and non-AVX versions of my code. I compiled it with gcc 8.3.0 using the following options: gcc -Ofast -mavx -mavx512f -m64 -mfpmath=sse -mfma -flto -funroll-loops matrix_minimal.c

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

/* Matrix of doubles kept in a single buffer, together with its dimensions. */
typedef struct {
        double* values; /* element buffer; the multiplication code addresses line l at values + l * nb_c */
        int nb_l;       /* number of lines */
        int nb_c;       /* number of columns */
} matrix;

/*
 * Allocate an nb_l x nb_c matrix whose values are left uninitialized.
 *
 * The value buffer is requested with a 64-byte alignment (the width of one
 * 512-bit vector). Its byte size is rounded up to a multiple of 64 because
 * aligned_alloc() is only guaranteed to accept sizes that are a multiple of
 * the alignment; an empty matrix still gets a small valid buffer.
 *
 * Returns NULL if a dimension is negative, if the byte size would overflow
 * size_t, or if an allocation fails. The caller owns the result and must
 * free both ->values and the struct itself.
 */
matrix* alloc_matrix(int nb_l, int nb_c){
        const size_t alignment = 64;
        /* Largest element count whose rounded-up byte size still fits in size_t. */
        const size_t max_values = ((size_t)-1 - (alignment - 1)) / sizeof(double);

        if (nb_l < 0 || nb_c < 0){
                return NULL;
        }
        if (nb_c != 0 && (size_t)nb_l > max_values / (size_t)nb_c){
                return NULL;
        }

        size_t nb_bytes = sizeof(double) * (size_t)nb_l * (size_t)nb_c;
        nb_bytes = (nb_bytes + alignment - 1) / alignment * alignment;
        if (nb_bytes == 0){
                nb_bytes = alignment; /* avoid the implementation-defined zero-size request */
        }

        matrix* tmp_matrix = malloc(sizeof *tmp_matrix);
        if (tmp_matrix == NULL){
                return NULL;
        }
        tmp_matrix->values = aligned_alloc(alignment, nb_bytes);
        if (tmp_matrix->values == NULL){
                free(tmp_matrix); /* do not leak the header when the buffer cannot be allocated */
                return NULL;
        }
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}

/*
 * Release a matrix and its value buffer, then reset the caller's pointer.
 *
 * to_free: address of the matrix pointer to release. A NULL address or a NULL
 *          matrix pointer is accepted and ignored.
 *
 * On return *to_free is NULL, so a later use or a second free_matrix() call
 * on the same pointer cannot touch freed memory.
 */
void free_matrix(matrix** to_free){
        if (to_free == NULL || *to_free == NULL){
                return;
        }
        free((*to_free)->values);
        free(*to_free);
        *to_free = NULL;
}

/*
 * Allocate an nb_l x nb_c matrix and set every value to 0.0.
 *
 * Returns NULL if a dimension is negative or if the allocation failed,
 * instead of writing through a NULL pointer.
 */
matrix* zero_matrix(int nb_l, int nb_c){
        if (nb_l < 0 || nb_c < 0){
                return NULL;
        }
        /* Count in size_t: the int product nb_l * nb_c can overflow. */
        const size_t nb_values = (size_t)nb_l * (size_t)nb_c;

        matrix* z_matrix = alloc_matrix(nb_l, nb_c);
        if (z_matrix == NULL){
                return NULL;
        }
        /* A missing buffer is only an error when there is something to store. */
        if (z_matrix->values == NULL && nb_values != 0){
                free(z_matrix);
                return NULL;
        }
        for (size_t idx = 0; idx < nb_values; idx++){
                z_matrix->values[idx] = 0.0;
        }
        return z_matrix;
}
/*
 * Allocate an nb_l x nb_c matrix filled with pseudo-random values in
 * [-max_abs_val, max_abs_val], drawn with rand().
 *
 * The generator is seeded once, on the first call, from the microsecond part
 * of the current time.
 *
 * Returns NULL if a dimension is negative or if the allocation failed.
 */
matrix* rand_matrix(int nb_l, int nb_c, double max_abs_val){
        static struct timeval seed; // static variables are zeroed at initialization

        if (nb_l < 0 || nb_c < 0){
                return NULL;
        }
        /* Count in size_t: the int product nb_l * nb_c can overflow. */
        const size_t nb_values = (size_t)nb_l * (size_t)nb_c;

        matrix* rnd_matrix = alloc_matrix(nb_l, nb_c);
        if (rnd_matrix == NULL){
                return NULL;
        }
        /* A missing buffer is only an error when there is something to store. */
        if (rnd_matrix->values == NULL && nb_values != 0){
                free(rnd_matrix);
                return NULL;
        }

        // tv_sec is used as the "already seeded" flag: it is never zero after
        // gettimeofday, whereas tv_usec could be
        if (seed.tv_sec == 0){
                gettimeofday(&seed, NULL);
                srand((unsigned) seed.tv_usec);
        }
        for (size_t idx = 0; idx < nb_values; idx++){
                // rand() / RAND_MAX is in [0, 1]; scale and shift it to [-1, 1]
                rnd_matrix->values[idx] = max_abs_val * ((double)rand() / RAND_MAX * 2.0 - 1.0);
        }

        return rnd_matrix;
}

/*
 * AVX-512 matrix product: returns a newly allocated A * B.
 *
 * Each value of A is broadcast to a 512-bit vector and multiplied with one
 * line of B, eight doubles at a time, and accumulated into the matching line
 * of the result with a fused multiply-add.
 *
 * Lines whose length is not a multiple of 8 are supported: full groups of 8
 * use unaligned-tolerant vector accesses (the line start is then not
 * necessarily 64-byte aligned) and the remaining columns are handled by a
 * scalar loop, so nothing is read or written past the end of a line.
 *
 * Returns NULL if A or B is NULL, if a dimension is negative, if A->nb_c
 * differs from B->nb_l, or if the result cannot be allocated. The caller
 * owns the result.
 */
matrix* mult_matrix_avx(matrix* A, matrix* B){
        if (A == NULL || B == NULL){
                return NULL;
        }
        if (A->nb_l < 0 || A->nb_c < 0 || B->nb_c < 0 || A->nb_c != B->nb_l){
                return NULL;
        }

        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        if (res_matrix == NULL){
                return NULL;
        }

        const size_t nb_lines = (size_t)A->nb_l;
        const size_t nb_inner = (size_t)A->nb_c; // columns of A == lines of B
        const size_t nb_cols = (size_t)B->nb_c;
        const size_t nb_cols_vec = nb_cols - nb_cols % 8; // columns covered by full vectors

        // Explicit line/column loops visit A in the same order as a flat index
        // would, without a division and a modulo per element.
        for (size_t line = 0; line < nb_lines; line++){
                const double* A_line = A->values + line * nb_inner;
                double* res_ptr = res_matrix->values + line * nb_cols; // current line of the result

                for (size_t inner = 0; inner < nb_inner; inner++){
                        const double a_value = A_line[inner];
                        const double* B_ptr = B->values + inner * nb_cols; // current line of B
                        const __m512d A_broadcast = _mm512_set1_pd(a_value); // current value of A, eight times
                        size_t col = 0;

                        for (; col < nb_cols_vec; col += 8){
                                const __m512d B_l_8 = _mm512_loadu_pd(&B_ptr[col]);
                                const __m512d res_8 = _mm512_loadu_pd(&res_ptr[col]);
                                _mm512_storeu_pd(&res_ptr[col], _mm512_fmadd_pd(A_broadcast, B_l_8, res_8));
                        }
                        // remaining columns when the line length is not a multiple of 8
                        for (; col < nb_cols; col++){
                                res_ptr[col] += a_value * B_ptr[col];
                        }
                }
        }
        return res_matrix;
}

/*
 * Scalar (non AVX-512) matrix product: returns a newly allocated A * B.
 *
 * For every value A[line][inner], the line `inner` of B scaled by that value
 * is added to the line `line` of the result, so B and the result are both
 * traversed along their lines.
 *
 * Returns NULL if A or B is NULL, if a dimension is negative, if A->nb_c
 * differs from B->nb_l, or if the result cannot be allocated. The caller
 * owns the result.
 */
matrix* mult_matrix(matrix* A, matrix* B){
        if (A == NULL || B == NULL){
                return NULL;
        }
        if (A->nb_l < 0 || A->nb_c < 0 || B->nb_c < 0 || A->nb_c != B->nb_l){
                return NULL;
        }

        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        if (res_matrix == NULL){
                return NULL;
        }

        const size_t nb_lines = (size_t)A->nb_l;
        const size_t nb_inner = (size_t)A->nb_c; // columns of A == lines of B
        const size_t nb_cols = (size_t)B->nb_c;

        // Explicit line/column loops visit A in the same order as a flat index
        // would, without a division and a modulo per element.
        for (size_t line = 0; line < nb_lines; line++){
                const double* A_line = A->values + line * nb_inner;
                double* res_ptr = res_matrix->values + line * nb_cols; // current line of the result

                for (size_t inner = 0; inner < nb_inner; inner++){
                        const double a_value = A_line[inner];
                        const double* B_ptr = B->values + inner * nb_cols; // current line of B

                        for (size_t col = 0; col < nb_cols; col++){
                                res_ptr[col] += a_value * B_ptr[col];
                        }
                }
        }
        return res_matrix;
}
/*
 * Mean duration of one run, in milliseconds, given the timestamps taken
 * before and after nb_runs consecutive runs.
 */
static double mean_elapsed_ms(const struct timeval* before, const struct timeval* after, int nb_runs){
        const double elapsed_us = (double)(after->tv_sec - before->tv_sec) * 1000000.0
                + (double)(after->tv_usec - before->tv_usec);
        return elapsed_us / 1000.0 / nb_runs;
}

/*
 * Benchmark: multiply two random 512x512 matrices 50 times with the AVX-512
 * kernel, then 50 times with the scalar kernel, and print the mean time of
 * one multiplication for each.
 *
 * The printed values are really milliseconds: the elapsed time is measured
 * in microseconds and converted in floating point, so the "ms" label is
 * accurate and no precision is lost to integer division.
 *
 * Returns 0 on success, EXIT_FAILURE if the input matrices cannot be allocated.
 */
int main(int argc, char *argv[]){
        enum { NB_RUNS = 50, DIM = 512 };
        struct timeval before;
        struct timeval after;

        (void)argc;
        (void)argv;

        matrix* A = rand_matrix(DIM, DIM, 5);
        matrix* B = rand_matrix(DIM, DIM, 5);
        matrix *C;

        if (A == NULL || B == NULL){
                fprintf(stderr, "matrix allocation failed\n");
                if (A != NULL){
                        free_matrix(&A);
                }
                if (B != NULL){
                        free_matrix(&B);
                }
                return EXIT_FAILURE;
        }

        gettimeofday(&before, NULL);
        for (int j = 0; j < NB_RUNS; j++){
                C = mult_matrix_avx(A, B);
                free_matrix(&C); // we will measure the same overhead here and in the non avx version
        }
        gettimeofday(&after, NULL);
        double delta = mean_elapsed_ms(&before, &after, NB_RUNS);
        printf("avx %lf ms\n", delta);

        gettimeofday(&before, NULL);
        for (int j = 0; j < NB_RUNS; j++){
                C = mult_matrix(A, B);
                free_matrix(&C);
        }
        gettimeofday(&after, NULL);
        delta = mean_elapsed_ms(&before, &after, NB_RUNS);
        printf("non avx %lf ms\n", delta);

        free_matrix(&A);
        free_matrix(&B);
        return 0;
}

Why is _mm512_store_pd super slow in this matrix multiplication code?

Trending Articles

Bath man appears in court charged with attempted murder of a man...

MACLEAN, Allan

Black Angus Grilled Artichokes

Practice Sheet of Right form of verbs for HSC Students

Police blotter for Jan. 12

99 God Status for Whatsapp, Facebook

Rajasthan Board 12th Science Result 2018 name wise- RBSE 12th commerce result...

Notorious Naushad of Ippa gang nabbed

Child Kidnapping: Amy McNeil was kidnapped on her way to school by 5 adults;...

Sonible Smartlimit v1.1.5-R2R

NCERT Solutions for Class 9th Sanskrit Chapter 3 पाथेयम्

मतलबी दोस्त स्टेट्स | Matlabi Dost Status in Hindi – Selfish Friends Status

Arrow Flash 2 – Sinhala Dubbed – Episode 23 – 20th March 2016

[GET] AI Traffic Goldmine

[E² Plugin] HDF-Radio

Universal Multi-Patch v1.3 By RADIXX11

IWAN – Thanks and Praise ( Throw Back Thursday )

RONALD P SONDERGAARD Arrested by Miami-Dade County Corrections on Mar 03, 2017

मुख मैथुन से उठाएं सेक्स का भरपूर मज़ा, जानें क्या है इसका सही तरीकामुख मैथुन...

HSSC Excise & Taxation Inspector Result 2017 Scorecard/ Category Wise Merit List