Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22169

Why is SSE4.2 cmpstr slower than regular code?

$
0
0

I'm trying to validate a string that must contain only visible ASCII characters, spaces and tabs (\t).

But it seems that ASCII table lookups are faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs. I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type, and only on the last one is the vectorized version faster.

My test code is here:

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <x86intrin.h>

// ALIGNED16: declaration qualifier requesting 16-byte alignment.
// MSVC spells it __declspec(align(16)); other compilers use the GNU
// attribute syntax.
#ifdef _MSC_VER
    #define ALIGNED16 __declspec(align(16))
#else
    #define ALIGNED16 __attribute__((aligned(16)))
#endif

// MEASURE(msg, stmt): execute stmt once and print the elapsed wall-clock time
// (from gettimeofday) in milliseconds, labelled with msg.
// Wrapped in do { ... } while (0) so that `MEASURE(...);` behaves as a single
// statement, including inside an unbraced if/else.
// Note: the locals tv, us1 and us2 are visible to (and may shadow names in) stmt.
#define MEASURE(msg,stmt) do { \
    struct timeval tv; \
    gettimeofday(&tv, NULL); \
    uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
    stmt; \
    gettimeofday(&tv, NULL); \
    uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
    printf("%-20s - %.4fms\n", (msg), ((double)us2 - us1) / 1000); \
} while (0)

// Character table helpers.
// VWSCHAR(c) looks c up in vis_ws_chars; the cast to unsigned char keeps the
// index in 0..255 even when plain char is signed.
#define VWSCHAR(c)  (vis_ws_chars[(unsigned char)(c)])   // Non-zero for visible characters, space and tab
// YES / NO expand to one table entry each, INCLUDING the trailing comma, so
// they can be written back to back inside the array initializer.
#define YES     1,
#define NO      0,
// Runs of 16 accepted / 16 rejected entries (one 16-character table row).
#define YES16   YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16    NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
// 128 rejected entries (eight rows of NO16).
#define NO128   NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16

// Lookup table indexed by byte value: 1 for visible ASCII characters, space
// and horizontal tab; 0 for every other byte. One 16-entry row per line.
static const ALIGNED16 int vis_ws_chars[256] = {
    // 0x00-0x0F: NUL..SI, only HT (0x09) is accepted
    NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
    // 0x10-0x1F: DLE..US
    NO16
    // 0x20-0x2F: space and punctuation up to '/'
    YES16
    // 0x30-0x3F: digits and ':' through '?'
    YES16
    // 0x40-0x4F: '@' and 'A' through 'O'
    YES16
    // 0x50-0x5F: 'P' through 'Z' and '[' through '_'
    YES16
    // 0x60-0x6F: '`' and 'a' through 'o'
    YES16
    // 0x70-0x7F: 'p' through '~' accepted, DEL (0x7F) rejected
    YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
    // 0x80-0xFF: non-ASCII bytes
    NO128
};

// Scan data[0..len) with the VWSCHAR lookup table.
// Returns the index of the first byte for which VWSCHAR is zero, or len when
// every byte in the buffer is accepted (the scan never reads past len).
size_t search_table(const char* data, size_t len)
{
    size_t i = 0;

    // Unrolled main loop: test 16 bytes per iteration. On a mismatch, leave i
    // at the start of the current 16-byte group; the tail loop below then
    // walks forward to the exact offending byte.
    while (len - i >= 16) {
        if (!VWSCHAR(data[i + 0])) break;
        if (!VWSCHAR(data[i + 1])) break;
        if (!VWSCHAR(data[i + 2])) break;
        if (!VWSCHAR(data[i + 3])) break;
        if (!VWSCHAR(data[i + 4])) break;
        if (!VWSCHAR(data[i + 5])) break;
        if (!VWSCHAR(data[i + 6])) break;
        if (!VWSCHAR(data[i + 7])) break;
        if (!VWSCHAR(data[i + 8])) break;
        if (!VWSCHAR(data[i + 9])) break;
        if (!VWSCHAR(data[i + 10])) break;
        if (!VWSCHAR(data[i + 11])) break;
        if (!VWSCHAR(data[i + 12])) break;
        if (!VWSCHAR(data[i + 13])) break;
        if (!VWSCHAR(data[i + 14])) break;
        if (!VWSCHAR(data[i + 15])) break;
        i += 16;
    }

    // Byte-wise tail, bounded by len so a fully valid buffer is not overrun.
    while (i < len && VWSCHAR(data[i])) {
        ++i;
    }
    return i;
}

// Scan data[0..len) 16 bytes at a time with the SSE4.2 _mm_cmpestri range
// comparison. Returns the index of the first byte that is neither a tab nor
// in the range ' '..'~', or len when every byte is accepted.
// Buffers (or remainders) shorter than 16 bytes are handled with the VWSCHAR
// table, never reading past len.
size_t search_sse4cmpstr(const char* data, size_t len)
{
    // Two inclusive [low, high] byte ranges; only the first 4 bytes are used
    // (explicit length 4 is passed to _mm_cmpestri), the rest is zero padding.
    static const char legal_ranges[16] = {
        '\t', '\t',
        ' ',  '~',
    };
    const __m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
    size_t i = 0;

    while (len - i >= 16) {
        const __m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
        // Negative polarity + least-significant index: position of the first
        // byte outside all ranges, or 16 when the whole vector is legal.
        const int consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
        i += (size_t)consumed;
        if (consumed < 16) {
            return i;
        }
    }

    // Fewer than 16 bytes remain: finish with bounded table lookups.
    while (i < len && VWSCHAR(data[i])) {
        ++i;
    }
    return i;
}

// Benchmark driver: fills about 1 GiB with lowercase letters, plants a single
// illegal '\n' two bytes before the end, then times both search functions and
// reports any result that differs from the planted position.
// Returns 0 on completion, 1 if the buffer cannot be mapped.
int main()
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    if (data == MAP_FAILED) {
        // mmap signals failure with MAP_FAILED, not NULL.
        perror("mmap");
        return 1;
    }
    for (size_t i = 0; i < len; ++i) {
        data[i] = 'a' + (i % 26);
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found

    // size_t values are printed with %zu; %u is a format/argument mismatch.
    MEASURE("table checking", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("cmpstr ranges", {
        size_t i = search_sse4cmpstr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });

    munmap(data, len);
    return 0;
}

Compiled with either g++ -std=gnu++11 -pedantic -Wall -Wextra -O3 -march=native main.cpp or gcc -pedantic -Wall -Wextra -O3 -march=native main2.cpp, it gives me these results:

Setting up 1GB of data...
table checking       - 482.322ms
cmpstr ranges        - 543.838ms

I've also checked the assembly output and search_sse4cmpstr uses vpcmpestri while search_table is non-vectorized.

Am I using it wrong? Or why does this instruction exist at all?


Viewing all articles
Browse latest Browse all 22169

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>