37 #include <x86intrin.h>
40 #define __STRINGIFY(s) #s
41 #define STRINGIFY(s) __STRINGIFY(s)
44 #define likely(x) __builtin_expect(!!(x), 1)
45 #define unlikely(x) __builtin_expect(!!(x), 0)
47 #define ONES_STEP_4 (UINT64_C(0x1111111111111111))
48 #define ONES_STEP_8 (UINT64_C(0x0101010101010101))
49 #define ONES_STEP_9 (UINT64_C(1) << 0 | UINT64_C(1) << 9 | UINT64_C(1) << 18 | UINT64_C(1) << 27 | UINT64_C(1) << 36 | UINT64_C(1) << 45 | UINT64_C(1) << 54)
50 #define ONES_STEP_16 (UINT64_C(1) << 0 | UINT64_C(1) << 16 | UINT64_C(1) << 32 | UINT64_C(1) << 48)
51 #define ONES_STEP_32 (UINT64_C(0x0000000100000001))
52 #define MSBS_STEP_4 (UINT64_C(0x8) * ONES_STEP_4)
53 #define MSBS_STEP_8 (UINT64_C(0x80) * ONES_STEP_8)
54 #define MSBS_STEP_9 (UINT64_C(0x100) * ONES_STEP_9)
55 #define MSBS_STEP_16 (UINT64_C(0x8000) * ONES_STEP_16)
56 #define MSBS_STEP_32 (UINT64_C(0x8000000080000000))
57 #define ULEQ_STEP_9(x, y) (((((((y) | MSBS_STEP_9) - ((x) & ~MSBS_STEP_9)) | (x ^ y)) ^ (x & ~y)) & MSBS_STEP_9) >> 8)
58 #define ULEQ_STEP_16(x, y) (((((((y) | MSBS_STEP_16) - ((x) & ~MSBS_STEP_16)) | (x ^ y)) ^ (x & ~y)) & MSBS_STEP_16) >> 15)
64 using std::make_unique;
65 using std::unique_ptr;
89 using auint64_t = std::uint64_t __attribute__((__may_alias__));
91 using auint32_t = std::uint32_t __attribute__((__may_alias__));
92 using auint16_t = std::uint16_t __attribute__((__may_alias__));
93 using auint8_t = std::uint8_t __attribute__((__may_alias__));
97 static constexpr uint64_t BYTE_MASK[] = {0x0ULL, 0xFFULL, 0xFFFFULL, 0xFFFFFFULL, 0xFFFFFFFFULL, 0xFFFFFFFFFFULL, 0xFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
103 int inline ceil_log2(
const uint64_t x) {
return x <= 2 ? x - 1 : 64 - __builtin_clzll(x - 1); }
126 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
127 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
128 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
129 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
130 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1,
131 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1,
132 8, 7, 7, 1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1,
133 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1,
134 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2,
135 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2,
136 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2,
137 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2,
138 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3,
139 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
140 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3,
141 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
142 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4,
143 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4,
144 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4,
145 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4,
146 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
147 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5,
148 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5,
149 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5,
150 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
151 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
152 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
153 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
154 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
155 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
156 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
157 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
168 inline int rho(uint64_t word) {
return __builtin_ctzll(word); }
178 inline int lambda(uint64_t word) {
return 63 ^ __builtin_clzll(word); }
188 inline int lambda_safe(uint64_t word) {
return word == 0 ? -1 : 63 ^ __builtin_clzll(word); }
196 return _blsr_u64(word);
198 return word & (word - 1);
208 inline uint64_t
mask_rho(uint64_t word) {
return word & (-word); }
216 inline uint64_t
mask_lambda(uint64_t word) {
return 0x8000000000000000ULL >> __builtin_clzll(word); }
226 inline uint64_t
compact_bitmask(
size_t count,
size_t pos) {
return (-(count != 0ULL)) & (UINT64_MAX >> (64 - count)) << pos; }
228 static inline uint64_t remap16(uint64_t x, uint64_t n) {
229 static const int masklen = 48;
230 static const uint64_t mask = (uint64_t(1) << masklen) - 1;
231 return ((x & mask) * n) >> masklen;
234 static inline uint64_t remap128(uint64_t x, uint64_t n) {
235 #ifdef __SIZEOF_INT128__
236 return (uint64_t)(((__uint128_t)x * (__uint128_t)n) >> 64);
239 return (uint32_t)x * n >> 32;
240 #endif // __SIZEOF_INT128__
252 inline uint64_t
bitextract(
const uint64_t *word,
int from,
int length) {
253 if (
likely((from + length) <= 64))
254 return (word[0] >> from) & (-1ULL >> (64 - length));
256 return (word[0] >> from) | ((word[1] << (128 - from - length)) >> (64 - from));
259 inline uint64_t
byteread(
const void *
const word,
int length) {
261 memcpy(&ret, word,
sizeof(uint64_t));
262 return ret & BYTE_MASK[length];
265 inline void bytewrite(
void *
const word,
int length, uint64_t val) {
267 memcpy(&old, word,
sizeof(uint64_t));
269 old = (old & ~BYTE_MASK[length]) | (val & BYTE_MASK[length]);
270 memcpy(word, &old,
sizeof(uint64_t));
275 memcpy(&value, word,
sizeof(uint64_t));
277 memcpy(word, &value,
sizeof(uint64_t));
280 inline uint64_t
bitread(
const void *
const word,
int from,
int length) {
282 memcpy(&ret, word,
sizeof(uint64_t));
284 if (
likely((from + length) <= 64)) {
285 return (ret >> from) & (-1ULL >> (64 - length));
288 memcpy(&next, static_cast<const uint64_t *>(word) + 1,
sizeof(uint64_t));
289 return (ret >> from) | (next << (128 - from - length) >> (64 - length));
293 inline void bitwrite(
void *word,
int from,
int length, uint64_t val) {
295 memcpy(&old, word,
sizeof(uint64_t));
296 assert(length == 64 || val < (1ULL << length));
298 if (
likely((from + length) <= 64)) {
299 const uint64_t mask = (-1ULL >> (64 - length)) << from;
300 old = (old & ~mask) | (val << from);
301 memcpy(word, &old,
sizeof(uint64_t));
303 const uint64_t maskw = -1ULL << from;
304 old = (old & ~maskw) | (val << from);
305 memcpy(word, &old,
sizeof(uint64_t));
308 memcpy(&next, static_cast<uint64_t *>(word) + 1,
sizeof(uint64_t));
309 const uint64_t maskb = -1ULL >> (128 - from - length);
310 next = (next & ~maskb) | (val >> (64 - from));
311 memcpy(static_cast<uint64_t *>(word) + 1, &next,
sizeof(uint64_t));
315 inline void bitwrite_inc(
void *
const word,
int from,
int length, uint64_t inc) {
317 memcpy(&value, word,
sizeof(uint64_t));
318 const uint64_t sum = (value >> from) + inc;
319 const uint64_t carry = from > 0 ? sum >> (64 - from) : 0;
321 if (
likely((from + length) <= 64 || carry == 0)) {
322 value += inc << from;
323 memcpy(word, &value,
sizeof(uint64_t));
325 value = from > 0 ? (value & (-1ULL >> (64 - from))) | (sum << from) : (sum << from);
326 memcpy(word, &value,
sizeof(uint64_t));
329 memcpy(&next, static_cast<uint64_t *>(word) + 1,
sizeof(uint64_t));
331 memcpy(static_cast<uint64_t *>(word) + 1, &next,
sizeof(uint64_t));
339 inline int nu(uint64_t word) {
return __builtin_popcountll(word); }
346 inline uint64_t
mround(uint64_t number, uint64_t multiple) {
return ((number - 1) | (multiple - 1)) + 1; }
374 constexpr uint64_t kOnesStep4 = 0x1111111111111111ULL;
375 constexpr uint64_t kOnesStep8 = 0x0101010101010101ULL;
376 constexpr uint64_t kLAMBDAsStep8 = 0x80ULL * kOnesStep8;
379 s = s - ((s & 0xA * kOnesStep4) >> 1);
380 s = (s & 0x3 * kOnesStep4) + ((s >> 2) & 0x3 * kOnesStep4);
381 s = (s + (s >> 4)) & 0xF * kOnesStep8;
382 uint64_t byteSums = s * kOnesStep8;
384 uint64_t kStep8 = k * kOnesStep8;
385 uint64_t geqKStep8 = (((kStep8 | kLAMBDAsStep8) - byteSums) & kLAMBDAsStep8);
386 uint64_t place =
nu(geqKStep8) * 8;
387 uint64_t byteRank = k - (((byteSums << 8) >> place) & uint64_t(0xFF));
388 return place +
kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)];
389 #elif defined(__GNUC__) || defined(__clang__)
391 uint64_t result = uint64_t(1) << k;
393 asm(
"pdep %1, %0, %0\n\t"
400 return _tzcnt_u64(_pdep_u64(1ULL << k, x));
409 } bint = {0x01020304};
411 return bint.c[0] == 1;
421 template <
class T>
typename std::enable_if<std::is_integral<T>::value, T>::type
swap_endian(T value) {
426 return ((uint16_t)swap_endian<std::uint8_t>(value & 0x00ff) << 8) | (uint16_t)swap_endian<std::uint8_t>(value >> 8);
428 return ((uint32_t)swap_endian<std::uint16_t>(value & 0x0000ffff) << 16) | (uint32_t)swap_endian<std::uint16_t>(value >> 16);
430 #pragma GCC diagnostic push
431 #pragma GCC diagnostic ignored "-Wshift-count-overflow"
432 return ((uint64_t)swap_endian<std::uint32_t>(value & 0x00000000ffffffffULL) << 32) | (uint64_t)swap_endian<std::uint32_t>(value >> 32);
433 #pragma GCC diagnostic pop
435 assert(
false &&
"unsupported size");
449 template <
class T> T
ntoh(T value) {
return hton(value); }
461 template <
class T> T
htol(T value) {
return ltoh(value); }