Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22016

Why is _mm512_store_pd super slow in this matrix multiplication code?

$
0
0

I'm playing with avx512 and matrix multiplications but I must be doing something wrong because I have awful performances when I try to store my results using _mm512_store_pd.

Here are the relevant snippets of code, first the data structure I'm using and how I initialize it:


typedef struct {
        double* values;
        int nb_l;
        int nb_c;
} matrix;

matrix* alloc_matrix(int nb_l, int nb_c){
        matrix* tmp_matrix = (matrix*)malloc(sizeof(matrix));
        tmp_matrix->values = (double*)aligned_alloc(64, sizeof(double) * nb_l * nb_c);
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}

And here is how I'm trying to multiply two matrices initialized elsewhere in my code:

matrix* mult_matrix(matrix* A, matrix* B){
        /* avx512 */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start index of the current line in res_matrix
        double* B_ptr; // start index of the current line in B

        __m512d A_broadcast, B_l_8, res_ptr_8;
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                // broadcast current value of A  eight times
                A_broadcast = _mm512_set1_pd(A->values[idx_A]);
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
                        B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
                        res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
                        _mm512_store_pd(
                                &res_ptr[offset_B] , 
                                _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8)
                                );
                }
        }
        return res_matrix;

The results are OK but _mm512_store_pd takes ~90% of the execution time, actually this avx512 code is barely faster than its non avx version.

I've tried everything I could think of but I'm can't find why I have such disappointing performances with this code. Do you have any idea ?

Thanks.

EDIT 1

Here is the non avx code

        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start index of the current line in res_matrix
        double* B_ptr; // start index of the current line in B

        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c; 
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c; 
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){                    
                        res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
                }
        }
        return res_matrix;



All matrices are 512x512 random matrices, each multiplication is repeated 50 times and the running time is averaged.

Finally the snippet below should be OK in order to test avx and non_avx versions of my code. I've compiled it with gcc 8.3.0 using the following options: gcc -Ofast -mavx -mavx512f -m64 -mfpmath=sse -mfma -flto -funroll-loops matrix_minimal.c

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

typedef struct {
        double* values;
        int nb_l;
        int nb_c;
} matrix;

matrix* alloc_matrix(int nb_l, int nb_c){
        matrix* tmp_matrix = (matrix*)malloc(sizeof(matrix));
        tmp_matrix->values = (double*)aligned_alloc(64, sizeof(double) * nb_l * nb_c);
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}

void free_matrix(matrix** to_free){
        free((*to_free)->values);
        free(*to_free);
}

matrix* zero_matrix(int nb_l, int nb_c){
        matrix* z_matrix;
        z_matrix = alloc_matrix(nb_l, nb_c);
        for (int idx=0; idx < nb_l * nb_c; idx++){
                z_matrix->values[idx] = 0.0;
        }
        return z_matrix;
}
matrix* rand_matrix(int nb_l, int nb_c, double max_abs_val){
        static struct timeval seed; //static variables are zeroed at initialization
        matrix* rnd_matrix;
        rnd_matrix = alloc_matrix(nb_l, nb_c);

        if (seed.tv_sec == 0){ //ts_sec will never be zero after gettimeofday, whereas tv_usec could
                gettimeofday(&seed, NULL);
                srand((unsigned) seed.tv_usec);
        }
        for (int idx=0; idx < nb_l * nb_c; idx++){
                rnd_matrix->values[idx] = max_abs_val * ((double)rand() / RAND_MAX * 2.0 - 1.0);
        }

        return rnd_matrix;
}

matrix* mult_matrix_avx(matrix* A, matrix* B){
        /* pas trop mal en avx512 */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start index of the current line in res_matrix
        double* B_ptr; // start index of the current line in B

        __m512d A_broadcast, B_l_8, res_ptr_8;
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                A_broadcast = _mm512_set1_pd(A->values[idx_A]); // broadcast current value of A eight times
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
                        B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
                        res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
                        _mm512_store_pd(&res_ptr[offset_B] , _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8));
                }
        }
        return res_matrix;
}

matrix* mult_matrix(matrix* A, matrix* B){
        /* non avx512 */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start index of the current line in res_matrix
        double* B_ptr; // start index of the current line in B

        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){
                        res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
                }
        }
        return res_matrix;
}
int main(int argc, char *argv[]){
        struct timeval before;
        struct timeval after;

        matrix* A = rand_matrix(512, 512, 5);
        matrix* B = rand_matrix(512, 512, 5);
        matrix *C;
        gettimeofday(&before, NULL);
        for (int j=0; j<50;j++){
                C = mult_matrix_avx(A, B);
                free_matrix(&C); // we will measure the same overhead here and in the non avx version
        }
        gettimeofday(&after, NULL);
        double delta = ((after.tv_sec - before.tv_sec) * 1000000 +
                (after.tv_usec - before.tv_usec))/50;
        printf("avx %lf ms\n", delta);
        gettimeofday(&before, NULL);
        for (int j=0; j<50;j++){
                C = mult_matrix(A, B);
                free_matrix(&C); 
        }
        gettimeofday(&after, NULL);
        delta = ((after.tv_sec - before.tv_sec) * 1000000 +
                (after.tv_usec - before.tv_usec))/50;
        printf("non avx %lf ms\n", delta);

        free_matrix(&A);
        free_matrix(&B);
        return 0;
}


Viewing all articles
Browse latest Browse all 22016

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>