I'm trying to validate a string that must only contain ASCII visible characters, white space and \t.
But it seems that ASCII table lookups are faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs. I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type and only on the last one is the vectorized version faster.
My test code is here:
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <x86intrin.h>
#ifdef _MSC_VER
#define ALIGNED16 _declspec(align(16))
#else
#define ALIGNED16 __attribute__((aligned(16)))
#endif
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
static const ALIGNED16 int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
size_t search_table(const char* data, size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
while (len - i >= 16) {
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
while (VWSCHAR(data[i++]))
;
--i;
return i;
}
size_t search_sse4cmpstr(const char* data, size_t len)
{
static const char legal_ranges[16] = {
'\t', '\t',
'', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
while (VWSCHAR(data[i++]))
;
--i;
return i;
}
int main()
{
printf("Setting up 1GB of data...\n");
size_t len = 1024 * 1024 * 1024 + 3;
char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
for (size_t i = 0; i < len; ++i) {
data[i] = 'a' + (i % 26);
}
size_t end = len - 2;
data[end] = '\n'; // Illegal character to be found
MEASURE("table checking", {
size_t i = search_table(data, len);
if (i != end) printf("INCORRECT RESULT: %u instead of %u", i, end);
});
MEASURE("cmpstr ranges", {
size_t i = search_sse4cmpstr(data, len);
if (i != end) printf("INCORRECT RESULT: %u instead of %u", i, end);
});
}
Compiled with g++ -std=gnu++11 -pedantic -Wall -Wextra -O3 -march=native main.cpp
or gcc -pedantic -Wall -Wextra -O3 -march=native main2.cpp
it gives me these results:
Setting up 1GB of data...
table checking - 482.322ms
cmpstr ranges - 543.838ms
I've also checked the assembly output and search_sse4cmpstr uses vpcmpestri while search_table is non-vectorized.
Am I using it wrong? Or why does this instruction exist at all?