# unrolled LCP
```
/// Length of the common prefix of two byte ranges, compared with AVX2.
///
/// The inputs are compared 32 bytes at a time using unaligned 256-bit loads;
/// the final (< 32) bytes are compared one at a time. A vector load is issued
/// only when at least 32 not-yet-compared bytes remain below `len_in`, so no
/// byte at index >= `len_in` is read from either input.
///
/// @param str1   First byte range; the first `len_in` bytes must be readable.
/// @param str2   Second byte range; the first `len_in` bytes must be readable.
/// @param len_in Maximum number of bytes to compare.
/// @return Index of the first position at which the inputs differ, or `len_in`
///         when their first `len_in` bytes are equal (0 if `len_in` is not
///         positive).
template <typename T_idx_>
inline T_idx_ Suffix_Array<T_idx_>::lcp_opt_avx(const char* str1, const char* str2, const idx_t len_in) {
    constexpr int64_t kLaneBytes = 32;                // Bytes covered by one 256-bit register.
    constexpr int64_t kBlockBytes = 8 * kLaneBytes;   // Eight lanes per main-loop iteration.

    const int64_t len = static_cast<int64_t>(len_in);

    // Compares the 32 bytes starting at `off` in both inputs. Returns the
    // absolute index of the first differing byte, or -1 if all 32 are equal.
    // The index is kept in 64 bits end-to-end: narrowing it to `int` would
    // corrupt the result for offsets beyond INT_MAX.
    const auto first_diff_in_lane = [str1, str2](const int64_t off) -> int64_t {
        const __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(str1 + off));
        const __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(str2 + off));
        // Bit k of the movemask is set iff byte k is equal; invert it so that
        // set bits mark mismatches and the lowest set bit is the first one.
        const uint32_t equal_bits = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)));
        const uint32_t diff_bits = ~equal_bits;
        if (diff_bits == 0) {
            return -1;
        }
        return off + static_cast<int64_t>(__builtin_ctz(diff_bits));
    };

    int64_t pos = 0;

    // Main loop: whole 256-byte blocks; the fixed-count inner loop lets the
    // compiler unroll the eight lane comparisons.
    for (; pos <= len - kBlockBytes; pos += kBlockBytes) {
        for (int64_t lane = 0; lane < kBlockBytes; lane += kLaneBytes) {
            const int64_t hit = first_diff_in_lane(pos + lane);
            if (hit >= 0) {
                return static_cast<idx_t>(hit);
            }
        }
    }

    // Fewer than 256 bytes remain: consume the remaining whole 32-byte lanes.
    for (; pos <= len - kLaneBytes; pos += kLaneBytes) {
        const int64_t hit = first_diff_in_lane(pos);
        if (hit >= 0) {
            return static_cast<idx_t>(hit);
        }
    }

    // Scalar tail: fewer than 32 bytes remain.
    for (; pos < len; ++pos) {
        if (str1[pos] != str2[pos]) {
            break;
        }
    }
    return static_cast<idx_t>(pos);
}
```