I managed to adapt the algorithm from the source link into a mask that works the opposite way from the others presented here, while using fewer operations: roughly 7*N + 2 operations for N distinct byte values to detect:
/*
 * Clear every byte of v whose value is 0x10, 0x23 or 0x45.
 *
 * All eight bytes of the 64-bit word are examined at once. Bytes that
 * match one of the target values come back as 0x00; every other byte is
 * returned unchanged.
 */
uint64_t maskbytes(uint64_t v) {
    const uint64_t lsb_each = UINT64_C(0x0101010101010101); /* 0x01 in every byte */
    const uint64_t msb_each = UINT64_C(0x8080808080808080); /* 0x80 in every byte */
    const uint64_t targets[] = {0x10, 0x23, 0x45};
    const unsigned target_count = sizeof targets / sizeof targets[0];

    /* Accumulates 0x80 in each byte lane that matched any target. */
    uint64_t hits = 0;

    for (unsigned i = 0; i < target_count; i++) {
        /* Broadcast the target to all lanes; matching lanes XOR to zero. */
        uint64_t diff = v ^ (targets[i] * lsb_each);

        /*
         * Exact per-lane zero test. Forcing the top bit on before the
         * subtraction keeps borrows from crossing into the next lane; the
         * top bit of the combined value ends up clear only when the lane
         * was entirely zero.
         */
        uint64_t nonzero = diff | ((diff | msb_each) - lsb_each);
        hits |= ~nonzero & msb_each;
    }

    /* Turn each 0x80 flag into 0x01, then widen it to a full 0xFF lane. */
    uint64_t erase = (hits >> 7) * 0xFF;

    return v & ~erase;
}
Thanks, everyone, for your contributions.