[AArch64] Optimized implementation of search_line_fast for the CPP lexer

* lex.c (search_line_fast): New implementation for AArch64.

From-SVN: r241964
This commit is contained in:
Richard Earnshaw 2016-11-08 13:29:32 +00:00 committed by Richard Earnshaw
parent 1f069ef5a1
commit a6ac871cdf
2 changed files with 99 additions and 0 deletions

View file

@ -1,3 +1,7 @@
2016-11-08 Richard Earnshaw <rearnsha@arm.com>
* lex.c (search_line_fast): New implementation for AArch64.
2016-10-25 David Malcolm <dmalcolm@redhat.com>
* files.c (destroy_cpp_file): Free file->path.

View file

@ -752,6 +752,101 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
}
}
#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
#include "arm_neon.h"
/* This doesn't have to be the exact page size, but no system may use
a size smaller than this. ARMv8 requires a minimum page size of
4k. The impact of being conservative here is a small number of
cases will take the slightly slower entry path into the main
loop. */
#define AARCH64_MIN_PAGE_SIZE 4096
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
const uint8x16_t repl_qm = vdupq_n_u8 ('?');
const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
#ifdef __AARCH64EB
const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif
unsigned int found;
const uint8_t *p;
uint8x16_t data;
uint8x16_t t;
uint16x8_t m;
uint8x16_t u, v, w;
/* Align the source pointer. */
p = (const uint8_t *)((uintptr_t)s & -16);
/* Assuming random string start positions, with a 4k page size we'll take
the slow path about 0.37% of the time. */
if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
- (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
< 16, 0))
{
/* Slow path: the string starts near a possible page boundary. */
uint32_t misalign, mask;
misalign = (uintptr_t)s & 15;
mask = (-1u << misalign) & 0xffff;
data = vld1q_u8 (p);
t = vceqq_u8 (data, repl_nl);
u = vceqq_u8 (data, repl_cr);
v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
t = vorrq_u8 (v, w);
t = vandq_u8 (t, xmask);
m = vpaddlq_u8 (t);
m = vshlq_u16 (m, shift);
found = vaddvq_u16 (m);
found &= mask;
if (found)
return (const uchar*)p + __builtin_ctz (found);
}
else
{
data = vld1q_u8 ((const uint8_t *) s);
t = vceqq_u8 (data, repl_nl);
u = vceqq_u8 (data, repl_cr);
v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
t = vorrq_u8 (v, w);
if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
goto done;
}
do
{
p += 16;
data = vld1q_u8 (p);
t = vceqq_u8 (data, repl_nl);
u = vceqq_u8 (data, repl_cr);
v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
t = vorrq_u8 (v, w);
} while (!vpaddd_u64 ((uint64x2_t)t));
done:
/* Now that we've found the terminating substring, work out precisely where
we need to stop. */
t = vandq_u8 (t, xmask);
m = vpaddlq_u8 (t);
m = vshlq_u16 (m, shift);
found = vaddvq_u16 (m);
return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
+ __builtin_ctz (found));
}
#elif defined (__ARM_NEON)
#include "arm_neon.h"