# unrolled LCP ``` template <typename T_idx_> inline T_idx_ Suffix_Array<T_idx_>::lcp_opt_avx(const char* str1, const char* str2, const idx_t len_in) { int64_t i = 0; int64_t len = static_cast<int64_t>(len_in); if ((len - i)>= 256) { for (; i <= len - 256; i += 256) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+96))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+96))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+96); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+128))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+128))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+128); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+160))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+160))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+160); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+192))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+192))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+192); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+224))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+224))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+224); return static_cast<idx_t>(j); } } } if ((len - i)>= 224) { for (; i <= len - 224; i += 224) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+96))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+96))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+96); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+128))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+128))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+128); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+160))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+160))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+160); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+192))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+192))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+192); return static_cast<idx_t>(j); } } } if ((len - i)>= 192) { for (; i <= len - 192; i += 192) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+96))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+96))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+96); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+128))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+128))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+128); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+160))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+160))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+160); return static_cast<idx_t>(j); } } } if ((len - i)>= 160) { for (; i <= len - 160; i += 160) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+96))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+96))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+96); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+128))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+128))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+128); return static_cast<idx_t>(j); } } } if ((len - i)>= 128) { for (; i <= len - 128; i += 128) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+96))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+96))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+96); return static_cast<idx_t>(j); } } } if ((len - i)>= 96) { for (; i <= len - 96; i += 96) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+64))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+64))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+64); return static_cast<idx_t>(j); } } } if ((len - i)>= 64) { for (; i <= len - 64; i += 64) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } v1 = _mm256_loadu_si256((__m256i*)(str1 + (i+32))); v2 = _mm256_loadu_si256((__m256i*)(str2 + (i+32))); cmp = _mm256_cmpeq_epi8(v1, v2); mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + (i+32); return static_cast<idx_t>(j); } } } if ((len - i) >= 32) { for (; i <= len - 32; i += 32) { __m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i)); __m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i)); __m256i cmp = _mm256_cmpeq_epi8(v1, v2); int mask = _mm256_movemask_epi8(cmp); if (mask != 0xFFFFFFFF) { int j = __builtin_ctz(~mask) + i; return static_cast<idx_t>(j); } } } for (; i < len; i++) { if (str1[i] != str2[i]) { break; } } return static_cast<idx_t>(i); } ```