Sux
RecSplit.hpp
Go to the documentation of this file.
1 /*
2  * Sux: Succinct data structures
3  *
4  * Copyright (C) 2019-2020 Emmanuel Esposito and Sebastiano Vigna
5  *
6  * This library is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU Lesser General Public License as published by the Free
8  * Software Foundation; either version 3 of the License, or (at your option)
9  * any later version.
10  *
11  * This library is free software; you can redistribute it and/or modify it under
12  * the terms of the GNU General Public License as published by the Free Software
13  * Foundation; either version 3, or (at your option) any later version.
14  *
15  * This library is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
17  * PARTICULAR PURPOSE. See the GNU General Public License for more details.
18  *
19  * Under Section 7 of GPL version 3, you are granted additional permissions
20  * described in the GCC Runtime Library Exception, version 3.1, as published by
21  * the Free Software Foundation.
22  *
23  * You should have received a copy of the GNU General Public License and a copy of
24  * the GCC Runtime Library Exception along with this program; see the files
25  * COPYING3 and COPYING.RUNTIME respectively. If not, see
26  * <http://www.gnu.org/licenses/>.
27  */
28 
29 #pragma once
30 
31 #include "../support/SpookyV2.hpp"
32 #include "../util/Vector.hpp"
33 #include "DoubleEF.hpp"
34 #include "RiceBitVector.hpp"
35 #include <array>
36 #include <cassert>
37 #include <chrono>
38 #include <cmath>
39 #include <string>
40 #include <vector>
41 
42 namespace sux::function {
43 
44 using namespace std;
45 using namespace std::chrono;
46 
47 // Assumed *maximum* size of a bucket. Works with high probability up to average bucket size ~2000.
48 static const int MAX_BUCKET_SIZE = 3000;
49 
50 static const int MAX_LEAF_SIZE = 24;
51 static const int MAX_FANOUT = 32;
52 
53 #if defined(MORESTATS) && !defined(STATS)
54 #define STATS
55 #endif
56 
57 #ifdef MORESTATS
58 
59 #define MAX_LEVEL_TIME (20)
60 
61 static constexpr double log2e = 1.44269504089;
62 static uint64_t num_bij_trials[MAX_LEAF_SIZE], num_split_trials;
63 static uint64_t num_bij_evals[MAX_LEAF_SIZE], num_split_evals;
64 static uint64_t bij_count[MAX_LEAF_SIZE], split_count;
65 static uint64_t expected_split_trials, expected_split_evals;
66 static uint64_t bij_unary, bij_fixed, bij_unary_golomb, bij_fixed_golomb;
67 static uint64_t split_unary, split_fixed, split_unary_golomb, split_fixed_golomb;
68 static uint64_t max_split_code, min_split_code, sum_split_codes;
69 static uint64_t max_bij_code, min_bij_code, sum_bij_codes;
70 static uint64_t sum_depths;
71 static uint64_t time_bij;
72 static uint64_t time_split[MAX_LEVEL_TIME];
73 #endif
74 
75 // Starting seed at given distance from the root (extracted at random).
76 static const uint64_t start_seed[] = {0x106393c187cae21a, 0x6453cec3f7376937, 0x643e521ddbd2be98, 0x3740c6412f6572cb, 0x717d47562f1ce470, 0x4cd6eb4c63befb7c, 0x9bfd8c5e18c8da73,
77  0x082f20e10092a9a3, 0x2ada2ce68d21defc, 0xe33cb4f3e7c6466b, 0x3980be458c509c59, 0xc466fd9584828e8c, 0x45f0aabe1a61ede6, 0xf6e7b8b33ad9b98d,
78  0x4ef95e25f4b4983d, 0x81175195173b92d3, 0x4e50927d8dd15978, 0x1ea2099d1fafae7f, 0x425c8a06fbaaa815, 0xcd4216006c74052a};
79 
88 uint64_t inline remix(uint64_t z) {
89  z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
90  z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
91  return z ^ (z >> 31);
92 }
93 
102 typedef struct __hash128_t {
103  uint64_t first, second;
104  bool operator<(const __hash128_t &o) const { return first < o.first || second < o.second; }
105  __hash128_t(const uint64_t first, const uint64_t second) {
106  this->first = first;
107  this->second = second;
108  }
109 } hash128_t;
110 
118 hash128_t inline spooky(const void *data, const size_t length, const uint64_t seed) {
119  uint64_t h0 = seed, h1 = seed;
120  SpookyHash::Hash128(data, length, &h0, &h1);
121  return {h1, h0};
122 }
123 
124 // Quick replacements for min/max on not-so-large integers.
125 
126 static constexpr inline uint64_t min(int64_t x, int64_t y) { return y + ((x - y) & ((x - y) >> 63)); }
127 static constexpr inline uint64_t max(int64_t x, int64_t y) { return x - ((x - y) & ((x - y) >> 63)); }
128 
129 // Optimal Golomb-Rice parameters for leaves.
130 static constexpr uint8_t bij_memo[] = {0, 0, 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30};
131 
132 #ifdef MORESTATS
133 // Optimal Golomb code moduli for leaves (for stats).
134 static constexpr uint64_t bij_memo_golomb[] = {0, 0, 1, 3, 7, 18, 45, 113, 288, 740,
135  1910, 4954, 12902, 33714, 88350, 232110, 611118, 1612087, 4259803, 11273253,
136  29874507, 79265963, 210551258, 559849470, 1490011429, 3968988882, 10580669970, 28226919646, 75354118356};
137 #endif
138 
145 template <size_t LEAF_SIZE> class SplittingStrategy {
146  static constexpr size_t _leaf = LEAF_SIZE;
147  static_assert(_leaf >= 1);
148  static_assert(_leaf <= MAX_LEAF_SIZE);
149  size_t m, curr_unit, curr_index, last_unit;
150  size_t _fanout;
151  size_t unit;
152 
153  inline size_t part_size() const { return (curr_index < _fanout - 1) ? unit : last_unit; }
154 
155  public:
157  static constexpr size_t lower_aggr = _leaf * max(2, ceil(0.35 * _leaf + 1. / 2));
159  static constexpr size_t upper_aggr = lower_aggr * (_leaf < 7 ? 2 : ceil(0.21 * _leaf + 9. / 10));
160 
161  static inline constexpr void split_params(const size_t m, size_t &fanout, size_t &unit) {
162  if (m > upper_aggr) { // High-level aggregation (fanout 2)
163  unit = upper_aggr * (uint16_t(m / 2 + upper_aggr - 1) / upper_aggr);
164  fanout = 2;
165  } else if (m > lower_aggr) { // Second-level aggregation
166  unit = lower_aggr;
167  fanout = uint16_t(m + lower_aggr - 1) / lower_aggr;
168  } else { // First-level aggregation
169  unit = _leaf;
170  fanout = uint16_t(m + _leaf - 1) / _leaf;
171  }
172  }
173 
174  // Note that you can call this iterator only *once*.
176  SplittingStrategy *strat;
177 
178  public:
179  using value_type = size_t;
180  using difference_type = ptrdiff_t;
181  using pointer = size_t *;
182  using reference = size_t &;
183  using iterator_category = input_iterator_tag;
184 
185  split_iterator(SplittingStrategy *strat) : strat(strat) {}
186  size_t operator*() const { return strat->curr_unit; }
187  size_t *operator->() const { return &strat->curr_unit; }
189  ++strat->curr_index;
190  strat->curr_unit = strat->part_size();
191  strat->last_unit -= strat->curr_unit;
192  return *this;
193  }
194  bool operator==(const split_iterator &other) const { return strat == other.strat; }
195  bool operator!=(const split_iterator &other) const { return !(*this == other); }
196  };
197 
198  explicit SplittingStrategy(size_t m) : m(m), last_unit(m), curr_index(0), curr_unit(0) {
199  split_params(m, _fanout, unit);
200  this->curr_unit = part_size();
201  this->last_unit -= this->curr_unit;
202  }
203 
204  split_iterator begin() { return split_iterator(this); }
205  split_iterator end() { return split_iterator(nullptr); }
206 
207  inline size_t fanout() { return this->_fanout; }
208 };
209 
210 // Generates the precomputed table of 32-bit values holding the Golomb-Rice code
211 // of a splitting (upper 5 bits), the number of nodes in the associated subtree
212 // (following 11 bits) and the sum of the Golomb-Rice codelengths in the same
213 // subtree (lower 16 bits).
214 
215 template <size_t LEAF_SIZE> static constexpr void _fill_golomb_rice(const int m, array<uint32_t, MAX_BUCKET_SIZE> *memo) {
216  array<int, MAX_FANOUT> k{0};
217 
218  size_t fanout = 0, unit = 0;
220 
221  k[fanout - 1] = m;
222  for (size_t i = 0; i < fanout - 1; ++i) {
223  k[i] = unit;
224  k[fanout - 1] -= k[i];
225  }
226 
227  double sqrt_prod = 1;
228  for (size_t i = 0; i < fanout; ++i) sqrt_prod *= sqrt(k[i]);
229 
230  const double p = sqrt(m) / (pow(2 * M_PI, (fanout - 1.) / 2) * sqrt_prod);
231  auto golomb_rice_length = (uint32_t)ceil(log2(-log((sqrt(5) + 1) / 2) / log1p(-p))); // log2 Golomb modulus
232 
233  assert(golomb_rice_length <= 0x1F); // Golomb-Rice code, stored in the 5 upper bits
234  (*memo)[m] = golomb_rice_length << 27;
235  for (size_t i = 0; i < fanout; ++i) golomb_rice_length += (*memo)[k[i]] & 0xFFFF;
236  assert(golomb_rice_length <= 0xFFFF); // Sum of Golomb-Rice codeslengths in the subtree, stored in the lower 16 bits
237  (*memo)[m] |= golomb_rice_length;
238 
239  uint32_t nodes = 1;
240  for (size_t i = 0; i < fanout; ++i) nodes += ((*memo)[k[i]] >> 16) & 0x7FF;
241  assert(LEAF_SIZE < 3 || nodes <= 0x7FF); // Number of nodes in the subtree, stored in the middle 11 bits
242  (*memo)[m] |= nodes << 16;
243 }
244 
245 template <size_t LEAF_SIZE> static constexpr array<uint32_t, MAX_BUCKET_SIZE> fill_golomb_rice() {
246  array<uint32_t, MAX_BUCKET_SIZE> memo{0};
247  size_t s = 0;
248  for (; s <= LEAF_SIZE; ++s) memo[s] = bij_memo[s] << 27 | (s > 1) << 16 | bij_memo[s];
249  for (; s < MAX_BUCKET_SIZE; ++s) _fill_golomb_rice<LEAF_SIZE>(s, &memo);
250  return memo;
251 }
252 
253 // Computes the Golomb modulu of a splitting (for statistics purposes only)
254 
255 template <size_t LEAF_SIZE> static constexpr uint64_t split_golomb_b(const int m) {
256  array<int, MAX_FANOUT> k{0};
257 
258  size_t fanout = 0, part = 0, unit = 0;
260 
261  k[fanout - 1] = m;
262  for (size_t i = 0; i < fanout - 1; ++i) {
263  k[i] = unit;
264  k[fanout - 1] -= k[i];
265  }
266 
267  double sqrt_prod = 1;
268  for (size_t i = 0; i < fanout; ++i) sqrt_prod *= sqrt(k[i]);
269 
270  const double p = sqrt(m) / (pow(2 * M_PI, (fanout - 1.) / 2) * sqrt_prod);
271  return ceil(-log(2 - p) / log1p(-p)); // Golomb modulus
272 }
273 
274 // Computes the point at which one should stop to test whether
275 // bijection extraction failed (around the square root of the leaf size).
276 
277 static constexpr array<uint8_t, MAX_LEAF_SIZE> fill_bij_midstop() {
278  array<uint8_t, MAX_LEAF_SIZE> memo{0};
279  for (int s = 0; s < MAX_LEAF_SIZE; ++s) memo[s] = s < (int)ceil(2 * sqrt(s)) ? s : (int)ceil(2 * sqrt(s));
280  return memo;
281 }
282 
283 #define first_hash(k, len) spooky(k, len, 0)
284 #define golomb_param(m) (memo[m] >> 27)
285 #define skip_bits(m) (memo[m] & 0xFFFF)
286 #define skip_nodes(m) ((memo[m] >> 16) & 0x7FF)
287 
299 template <size_t LEAF_SIZE, util::AllocType AT = util::AllocType::MALLOC> class RecSplit {
301 
302  static constexpr size_t _leaf = LEAF_SIZE;
303  static constexpr size_t lower_aggr = SplitStrat::lower_aggr;
304  static constexpr size_t upper_aggr = SplitStrat::upper_aggr;
305 
306  // For each bucket size, the Golomb-Rice parameter (upper 8 bits) and the number of bits to
307  // skip in the fixed part of the tree (lower 24 bits).
308  static constexpr array<uint32_t, MAX_BUCKET_SIZE> memo = fill_golomb_rice<LEAF_SIZE>();
309  static constexpr array<uint8_t, MAX_LEAF_SIZE> bij_midstop = fill_bij_midstop();
310 
311  size_t bucket_size;
312  size_t nbuckets;
313  size_t keys_count;
314  RiceBitVector<AT> descriptors;
315  DoubleEF<AT> ef;
316 
317  public:
318  RecSplit() {}
319 
329  RecSplit(const vector<string> &keys, const size_t bucket_size) {
330  this->bucket_size = bucket_size;
331  this->keys_count = keys.size();
332  hash128_t *h = (hash128_t *)malloc(this->keys_count * sizeof(hash128_t));
333  for (size_t i = 0; i < this->keys_count; ++i) {
334  h[i] = first_hash(keys[i].c_str(), keys[i].size());
335  }
336  hash_gen(h);
337  free(h);
338  }
339 
350  RecSplit(vector<hash128_t> &keys, const size_t bucket_size) {
351  this->bucket_size = bucket_size;
352  this->keys_count = keys.size();
353  hash_gen(&keys[0]);
354  }
355 
363  RecSplit(ifstream& input, const size_t bucket_size) {
364  this->bucket_size = bucket_size;
365  vector<hash128_t> h;
366  for(string key; getline(input, key);) h.push_back(first_hash(key.c_str(), key.size()));
367  this->keys_count = h.size();
368  hash_gen(&h[0]);
369  }
370 
377  size_t operator()(const hash128_t &hash) {
378  const size_t bucket = hash128_to_bucket(hash);
379  uint64_t cum_keys, cum_keys_next, bit_pos;
380  ef.get(bucket, cum_keys, cum_keys_next, bit_pos);
381 
382  // Number of keys in this bucket
383  size_t m = cum_keys_next - cum_keys;
384 
385  descriptors.readReset(bit_pos, skip_bits(m));
386  int level = 0;
387 
388  while (m > upper_aggr) { // fanout = 2
389  const auto d = descriptors.readNext(golomb_param(m));
390  const size_t hmod = remap16(remix(hash.second + d + start_seed[level]), m);
391 
392  const uint32_t split = ((uint16_t(m / 2 + upper_aggr - 1) / upper_aggr)) * upper_aggr;
393  if (hmod < split) {
394  m = split;
395  } else {
396  descriptors.skipSubtree(skip_nodes(split), skip_bits(split));
397  m -= split;
398  cum_keys += split;
399  }
400  level++;
401  }
402  if (m > lower_aggr) {
403  const auto d = descriptors.readNext(golomb_param(m));
404  const size_t hmod = remap16(remix(hash.second + d + start_seed[level]), m);
405 
406  const int part = uint16_t(hmod) / lower_aggr;
407  m = min(lower_aggr, m - part * lower_aggr);
408  cum_keys += lower_aggr * part;
409  if (part) descriptors.skipSubtree(skip_nodes(lower_aggr) * part, skip_bits(lower_aggr) * part);
410  level++;
411  }
412 
413  if (m > _leaf) {
414  const auto d = descriptors.readNext(golomb_param(m));
415  const size_t hmod = remap16(remix(hash.second + d + start_seed[level]), m);
416 
417  const int part = uint16_t(hmod) / _leaf;
418  m = min(_leaf, m - part * _leaf);
419  cum_keys += _leaf * part;
420  if (part) descriptors.skipSubtree(part, skip_bits(_leaf) * part);
421  level++;
422  }
423 
424  const auto b = descriptors.readNext(golomb_param(m));
425  return cum_keys + remap16(remix(hash.second + b + start_seed[level]), m);
426  }
427 
433  size_t operator()(const string &key) { return operator()(first_hash(key.c_str(), key.size())); }
434 
436  inline size_t size() { return this->keys_count; }
437 
438  private:
439  // Maps a 128-bit to a bucket using the first 64-bit half.
440  inline uint64_t hash128_to_bucket(const hash128_t &hash) const { return remap128(hash.first, nbuckets); }
441 
442  // Computes and stores the splittings and bijections of a bucket.
443  void recSplit(vector<uint64_t> &bucket, typename RiceBitVector<AT>::Builder &builder, vector<uint32_t> &unary) {
444  const auto m = bucket.size();
445  vector<uint64_t> temp(m);
446  recSplit(bucket, temp, 0, bucket.size(), builder, unary, 0);
447  }
448 
449  void recSplit(vector<uint64_t> &bucket, vector<uint64_t> &temp, size_t start, size_t end, typename RiceBitVector<AT>::Builder &builder, vector<uint32_t> &unary, const int level) {
450  const auto m = end - start;
451  assert(m > 1);
452  uint64_t x = start_seed[level];
453 
454  if (m <= _leaf) {
455 #ifdef MORESTATS
456  sum_depths += m * level;
457  auto start_time = high_resolution_clock::now();
458 #endif
459  uint32_t mask;
460  const uint32_t found = (1 << m) - 1;
461  if constexpr (_leaf <= 8) {
462  for (;;) {
463  mask = 0;
464  for (size_t i = start; i < end; i++) mask |= uint32_t(1) << remap16(remix(bucket[i] + x), m);
465 #ifdef MORESTATS
466  num_bij_evals[m] += m;
467 #endif
468  if (mask == found) break;
469  x++;
470  }
471  } else {
472  const size_t midstop = bij_midstop[m];
473  for (;;) {
474  mask = 0;
475  size_t i;
476  for (i = start; i < start + midstop; i++) mask |= uint32_t(1) << remap16(remix(bucket[i] + x), m);
477 #ifdef MORESTATS
478  num_bij_evals[m] += midstop;
479 #endif
480  if (nu(mask) == midstop) {
481  for (; i < end; i++) mask |= uint32_t(1) << remap16(remix(bucket[i] + x), m);
482 #ifdef MORESTATS
483  num_bij_evals[m] += m - midstop;
484 #endif
485  if (mask == found) break;
486  }
487  x++;
488  }
489  }
490 #ifdef MORESTATS
491  time_bij += duration_cast<nanoseconds>(high_resolution_clock::now() - start_time).count();
492 #endif
493  x -= start_seed[level];
494  const auto log2golomb = golomb_param(m);
495  builder.appendFixed(x, log2golomb);
496  unary.push_back(x >> log2golomb);
497 #ifdef MORESTATS
498  bij_count[m]++;
499  num_bij_trials[m] += x + 1;
500  bij_unary += 1 + (x >> log2golomb);
501  bij_fixed += log2golomb;
502 
503  min_bij_code = min(min_bij_code, x);
504  max_bij_code = max(max_bij_code, x);
505  sum_bij_codes += x;
506 
507  auto b = bij_memo_golomb[m];
508  auto log2b = lambda(b);
509  bij_unary_golomb += x / b + 1;
510  bij_fixed_golomb += x % b < ((1 << log2b + 1) - b) ? log2b : log2b + 1;
511 #endif
512  } else {
513 #ifdef MORESTATS
514  auto start_time = high_resolution_clock::now();
515 #endif
516  if (m > upper_aggr) { // fanout = 2
517  const size_t split = ((uint16_t(m / 2 + upper_aggr - 1) / upper_aggr)) * upper_aggr;
518 
519  size_t count[2];
520  for (;;) {
521  count[0] = 0;
522  for (size_t i = start; i < end; i++) {
523  count[remap16(remix(bucket[i] + x), m) >= split]++;
524 #ifdef MORESTATS
525  ++num_split_evals;
526 #endif
527  }
528  if (count[0] == split) break;
529  x++;
530  }
531 
532  count[0] = 0;
533  count[1] = split;
534  for (size_t i = start; i < end; i++) {
535  temp[count[remap16(remix(bucket[i] + x), m) >= split]++] = bucket[i];
536  }
537  copy(&temp[0], &temp[m], &bucket[start]);
538  x -= start_seed[level];
539  const auto log2golomb = golomb_param(m);
540  builder.appendFixed(x, log2golomb);
541  unary.push_back(x >> log2golomb);
542 
543 #ifdef MORESTATS
544  time_split[min(MAX_LEVEL_TIME, level)] += duration_cast<nanoseconds>(high_resolution_clock::now() - start_time).count();
545 #endif
546  recSplit(bucket, temp, start, start + split, builder, unary, level + 1);
547  if (m - split > 1) recSplit(bucket, temp, start + split, end, builder, unary, level + 1);
548 #ifdef MORESTATS
549  else
550  sum_depths += level;
551 #endif
552  } else if (m > lower_aggr) { // 2nd aggregation level
553  const size_t fanout = uint16_t(m + lower_aggr - 1) / lower_aggr;
554  size_t count[fanout]; // Note that we never read count[fanout-1]
555  for (;;) {
556  memset(count, 0, sizeof count - sizeof *count);
557  for (size_t i = start; i < end; i++) {
558  count[uint16_t(remap16(remix(bucket[i] + x), m)) / lower_aggr]++;
559 #ifdef MORESTATS
560  ++num_split_evals;
561 #endif
562  }
563  size_t broken = 0;
564  for (size_t i = 0; i < fanout - 1; i++) broken |= count[i] - lower_aggr;
565  if (!broken) break;
566  x++;
567  }
568 
569  for (size_t i = 0, c = 0; i < fanout; i++, c += lower_aggr) count[i] = c;
570  for (size_t i = start; i < end; i++) {
571  temp[count[uint16_t(remap16(remix(bucket[i] + x), m)) / lower_aggr]++] = bucket[i];
572  }
573  copy(&temp[0], &temp[m], &bucket[start]);
574 
575  x -= start_seed[level];
576  const auto log2golomb = golomb_param(m);
577  builder.appendFixed(x, log2golomb);
578  unary.push_back(x >> log2golomb);
579 
580 #ifdef MORESTATS
581  time_split[min(MAX_LEVEL_TIME, level)] += duration_cast<nanoseconds>(high_resolution_clock::now() - start_time).count();
582 #endif
583  size_t i;
584  for (i = 0; i < m - lower_aggr; i += lower_aggr) {
585  recSplit(bucket, temp, start + i, start + i + lower_aggr, builder, unary, level + 1);
586  }
587  if (m - i > 1) recSplit(bucket, temp, start + i, end, builder, unary, level + 1);
588 #ifdef MORESTATS
589  else
590  sum_depths += level;
591 #endif
592  } else { // First aggregation level, m <= lower_aggr
593  const size_t fanout = uint16_t(m + _leaf - 1) / _leaf;
594  size_t count[fanout]; // Note that we never read count[fanout-1]
595  for (;;) {
596  memset(count, 0, sizeof count - sizeof *count);
597  for (size_t i = start; i < end; i++) {
598  count[uint16_t(remap16(remix(bucket[i] + x), m)) / _leaf]++;
599 #ifdef MORESTATS
600  ++num_split_evals;
601 #endif
602  }
603  size_t broken = 0;
604  for (size_t i = 0; i < fanout - 1; i++) broken |= count[i] - _leaf;
605  if (!broken) break;
606  x++;
607  }
608  for (size_t i = 0, c = 0; i < fanout; i++, c += _leaf) count[i] = c;
609  for (size_t i = start; i < end; i++) {
610  temp[count[uint16_t(remap16(remix(bucket[i] + x), m)) / _leaf]++] = bucket[i];
611  }
612  copy(&temp[0], &temp[m], &bucket[start]);
613 
614  x -= start_seed[level];
615  const auto log2golomb = golomb_param(m);
616  builder.appendFixed(x, log2golomb);
617  unary.push_back(x >> log2golomb);
618 
619 #ifdef MORESTATS
620  time_split[min(MAX_LEVEL_TIME, level)] += duration_cast<nanoseconds>(high_resolution_clock::now() - start_time).count();
621 #endif
622  size_t i;
623  for (i = 0; i < m - _leaf; i += _leaf) {
624  recSplit(bucket, temp, start + i, start + i + _leaf, builder, unary, level + 1);
625  }
626  if (m - i > 1) recSplit(bucket, temp, start + i, end, builder, unary, level + 1);
627 #ifdef MORESTATS
628  else
629  sum_depths += level;
630 #endif
631  }
632 
633 #ifdef MORESTATS
634  ++split_count;
635  num_split_trials += x + 1;
636  double e_trials = 1;
637  size_t aux = m;
638  SplitStrat strat{m};
639  auto v = strat.begin();
640  for (int i = 0; i < strat.fanout(); ++i, ++v) {
641  e_trials *= pow((double)m / *v, *v);
642  for (size_t j = *v; j > 0; --j, --aux) {
643  e_trials *= (double)j / aux;
644  }
645  }
646  expected_split_trials += (size_t)e_trials;
647  expected_split_evals += (size_t)e_trials * m;
648  const auto log2golomb = golomb_param(m);
649  split_unary += 1 + (x >> log2golomb);
650  split_fixed += log2golomb;
651 
652  min_split_code = min(min_split_code, x);
653  max_split_code = max(max_split_code, x);
654  sum_split_codes += x;
655 
656  auto b = split_golomb_b<LEAF_SIZE>(m);
657  auto log2b = lambda(b);
658  split_unary_golomb += x / b + 1;
659  split_fixed_golomb += x % b < ((1ULL << log2b + 1) - b) ? log2b : log2b + 1;
660 #endif
661  }
662  }
663 
664  void hash_gen(hash128_t *hashes) {
665 #ifdef MORESTATS
666  time_bij = 0;
667  memset(time_split, 0, sizeof time_split);
668  split_unary = split_fixed = 0;
669  bij_unary = bij_fixed = 0;
670  min_split_code = 1UL << 63;
671  max_split_code = sum_split_codes = 0;
672  min_bij_code = 1UL << 63;
673  max_bij_code = sum_bij_codes = 0;
674  sum_depths = 0;
675  size_t minsize = keys_count, maxsize = 0;
676  double ub_split_bits = 0, ub_bij_bits = 0;
677  double ub_split_evals = 0, ub_bij_evals = 0;
678 #endif
679 
680 #ifndef __SIZEOF_INT128__
681  if (keys_count > (1ULL << 32)) {
682  fprintf(stderr, "For more than 2^32 keys, you need 128-bit integer support.\n");
683  abort();
684  }
685 #endif
686  nbuckets = max(1, (keys_count + bucket_size - 1) / bucket_size);
687  auto bucket_size_acc = vector<int64_t>(nbuckets + 1);
688  auto bucket_pos_acc = vector<int64_t>(nbuckets + 1);
689 
690  sort(hashes, hashes + keys_count, [this](const hash128_t &a, const hash128_t &b) { return hash128_to_bucket(a) < hash128_to_bucket(b); });
691  typename RiceBitVector<AT>::Builder builder;
692 
693  bucket_size_acc[0] = bucket_pos_acc[0] = 0;
694  for (size_t i = 0, last = 0; i < nbuckets; i++) {
695  vector<uint64_t> bucket;
696  for (; last < keys_count && hash128_to_bucket(hashes[last]) == i; last++) bucket.push_back(hashes[last].second);
697 
698  const size_t s = bucket.size();
699  bucket_size_acc[i + 1] = bucket_size_acc[i] + s;
700  if (bucket.size() > 1) {
701  vector<uint32_t> unary;
702  recSplit(bucket, builder, unary);
703  builder.appendUnaryAll(unary);
704  }
705  bucket_pos_acc[i + 1] = builder.getBits();
706 #ifdef MORESTATS
707  auto upper_leaves = (s + _leaf - 1) / _leaf;
708  auto upper_height = ceil(log(upper_leaves) / log(2)); // TODO: check
709  auto upper_s = _leaf * pow(2, upper_height);
710  ub_split_bits += (double)upper_s / (_leaf * 2) * log2(2 * M_PI * _leaf) - .5 * log2(2 * M_PI * upper_s);
711  ub_bij_bits += upper_leaves * _leaf * (log2e - .5 / _leaf * log2(2 * M_PI * _leaf));
712  ub_split_evals += 4 * upper_s * sqrt(pow(2 * M_PI * upper_s, 2 - 1) / pow(2, 2));
713  minsize = min(minsize, s);
714  maxsize = max(maxsize, s);
715 #endif
716  }
717  builder.appendFixed(1, 1); // Sentinel (avoids checking for parts of size 1)
718  descriptors = builder.build();
719  ef = DoubleEF<AT>(vector<uint64_t>(bucket_size_acc.begin(), bucket_size_acc.end()), vector<uint64_t>(bucket_pos_acc.begin(), bucket_pos_acc.end()));
720 
721 #ifdef STATS
722  // Evaluation purposes only
723  double ef_sizes = (double)ef.bitCountCumKeys() / keys_count;
724  double ef_bits = (double)ef.bitCountPosition() / keys_count;
725  double rice_desc = (double)builder.getBits() / keys_count;
726  printf("Elias-Fano cumul sizes: %f bits/bucket\n", (double)ef.bitCountCumKeys() / nbuckets);
727  printf("Elias-Fano cumul bits: %f bits/bucket\n", (double)ef.bitCountPosition() / nbuckets);
728  printf("Elias-Fano cumul sizes: %f bits/key\n", ef_sizes);
729  printf("Elias-Fano cumul bits: %f bits/key\n", ef_bits);
730  printf("Rice-Golomb descriptors: %f bits/key\n", rice_desc);
731  printf("Total bits: %f bits/key\n", ef_sizes + ef_bits + rice_desc);
732 #endif
733 #ifdef MORESTATS
734 
735  printf("\n");
736  printf("Min bucket size: %lu\n", minsize);
737  printf("Max bucket size: %lu\n", maxsize);
738 
739  printf("\n");
740  printf("Bijections: %13.3f ms\n", time_bij * 1E-6);
741  for (int i = 0; i < MAX_LEVEL_TIME; i++) {
742  if (time_split[i] > 0) {
743  printf("Split level %d: %10.3f ms\n", i, time_split[i] * 1E-6);
744  }
745  }
746 
747  uint64_t fact = 1, tot_bij_count = 0, tot_bij_evals;
748  printf("\n");
749  printf("Bij count trials exp evals exp tot evals\n");
750  for (int i = 0; i < MAX_LEAF_SIZE; i++) {
751  if (num_bij_trials[i] != 0) {
752  tot_bij_count += bij_count[i];
753  tot_bij_evals += num_bij_evals[i];
754  printf("%-3d%20d%20.2f%20.2f%20.2f%20.2f%20lld\n", i, bij_count[i], (double)num_bij_trials[i] / bij_count[i], pow(i, i) / fact, (double)num_bij_evals[i] / bij_count[i],
755  (_leaf <= 8 ? i : bij_midstop[i]) * pow(i, i) / fact, num_bij_evals[i]);
756  }
757  fact *= (i + 1);
758  }
759 
760  printf("\n");
761  printf("Split count: %16zu\n", split_count);
762 
763  printf("Total split evals: %16lld\n", num_split_evals);
764  printf("Total bij evals: %16lld\n", tot_bij_evals);
765  printf("Total evals: %16lld\n", num_split_evals + tot_bij_evals);
766 
767  printf("\n");
768  printf("Average depth: %f\n", (double)sum_depths / keys_count);
769  printf("\n");
770  printf("Trials per split: %16.3f\n", (double)num_split_trials / split_count);
771  printf("Exp trials per split: %16.3f\n", (double)expected_split_trials / split_count);
772  printf("Evals per split: %16.3f\n", (double)num_split_evals / split_count);
773  printf("Exp evals per split: %16.3f\n", (double)expected_split_evals / split_count);
774 
775  printf("\n");
776  printf("Unary bits per bij: %10.5f\n", (double)bij_unary / tot_bij_count);
777  printf("Fixed bits per bij: %10.5f\n", (double)bij_fixed / tot_bij_count);
778  printf("Total bits per bij: %10.5f\n", (double)(bij_unary + bij_fixed) / tot_bij_count);
779 
780  printf("\n");
781  printf("Unary bits per split: %10.5f\n", (double)split_unary / split_count);
782  printf("Fixed bits per split: %10.5f\n", (double)split_fixed / split_count);
783  printf("Total bits per split: %10.5f\n", (double)(split_unary + split_fixed) / split_count);
784  printf("Total bits per key: %10.5f\n", (double)(bij_unary + bij_fixed + split_unary + split_fixed) / keys_count);
785 
786  printf("\n");
787  printf("Unary bits per bij (Golomb): %10.5f\n", (double)bij_unary_golomb / tot_bij_count);
788  printf("Fixed bits per bij (Golomb): %10.5f\n", (double)bij_fixed_golomb / tot_bij_count);
789  printf("Total bits per bij (Golomb): %10.5f\n", (double)(bij_unary_golomb + bij_fixed_golomb) / tot_bij_count);
790 
791  printf("\n");
792  printf("Unary bits per split (Golomb): %10.5f\n", (double)split_unary_golomb / split_count);
793  printf("Fixed bits per split (Golomb): %10.5f\n", (double)split_fixed_golomb / split_count);
794  printf("Total bits per split (Golomb): %10.5f\n", (double)(split_unary_golomb + split_fixed_golomb) / split_count);
795  printf("Total bits per key (Golomb): %10.5f\n", (double)(bij_unary_golomb + bij_fixed_golomb + split_unary_golomb + split_fixed_golomb) / keys_count);
796 
797  printf("\n");
798 
799  printf("Total split bits %16.3f\n", (double)split_fixed + split_unary);
800  printf("Upper bound split bits: %16.3f\n", ub_split_bits);
801  printf("Total bij bits: %16.3f\n", (double)bij_fixed + bij_unary);
802  printf("Upper bound bij bits: %16.3f\n\n", ub_bij_bits);
803 #endif
804  }
805 
806  friend ostream &operator<<(ostream &os, const RecSplit<LEAF_SIZE, AT> &rs) {
807  const size_t leaf_size = LEAF_SIZE;
808  os.write((char *)&leaf_size, sizeof(leaf_size));
809  os.write((char *)&rs.bucket_size, sizeof(rs.bucket_size));
810  os.write((char *)&rs.keys_count, sizeof(rs.keys_count));
811  os << rs.descriptors;
812  os << rs.ef;
813  return os;
814  }
815 
816  friend istream &operator>>(istream &is, RecSplit<LEAF_SIZE, AT> &rs) {
817  size_t leaf_size;
818  is.read((char *)&leaf_size, sizeof(leaf_size));
819  if (leaf_size != LEAF_SIZE) {
820  fprintf(stderr, "Serialized leaf size %d, code leaf size %d\n", int(leaf_size), int(LEAF_SIZE));
821  abort();
822  }
823  is.read((char *)&rs.bucket_size, sizeof(bucket_size));
824  is.read((char *)&rs.keys_count, sizeof(keys_count));
825  rs.nbuckets = max(1, (rs.keys_count + rs.bucket_size - 1) / rs.bucket_size);
826 
827  is >> rs.descriptors;
828  is >> rs.ef;
829  return is;
830  }
831 };
832 
833 } // namespace sux::function
sux::function::SplittingStrategy::split_iterator::operator==
bool operator==(const split_iterator &other) const
Definition: RecSplit.hpp:194
sux::function::RecSplit::RecSplit
RecSplit(const vector< string > &keys, const size_t bucket_size)
Definition: RecSplit.hpp:329
sux::function::RecSplit
Definition: RecSplit.hpp:299
sux::function::__hash128_t::second
uint64_t second
Definition: RecSplit.hpp:103
sux::function::SplittingStrategy::split_iterator::split_iterator
split_iterator(SplittingStrategy *strat)
Definition: RecSplit.hpp:185
sux::function::remix
uint64_t remix(uint64_t z)
Definition: RecSplit.hpp:88
sux::function::spooky
hash128_t spooky(const void *data, const size_t length, const uint64_t seed)
Definition: RecSplit.hpp:118
sux::function::SplittingStrategy::split_iterator::operator++
split_iterator & operator++()
Definition: RecSplit.hpp:188
sux::function::__hash128_t::operator<
bool operator<(const __hash128_t &o) const
Definition: RecSplit.hpp:104
sux::function::DoubleEF::get
void get(const uint64_t i, uint64_t &cum_keys, uint64_t &cum_keys_next, uint64_t &position)
Definition: DoubleEF.hpp:229
sux::function::RiceBitVector::readNext
uint64_t readNext(const int log2golomb)
Definition: RiceBitVector.hpp:129
sux::function::SplittingStrategy::split_iterator
Definition: RecSplit.hpp:175
skip_bits
#define skip_bits(m)
Definition: RecSplit.hpp:285
sux::function::RiceBitVector::readReset
void readReset(const size_t bit_pos, const size_t unary_offset)
Definition: RiceBitVector.hpp:174
sux::function::SplittingStrategy::split_params
static constexpr void split_params(const size_t m, size_t &fanout, size_t &unit)
Definition: RecSplit.hpp:161
sux::function::RecSplit::RecSplit
RecSplit(ifstream &input, const size_t bucket_size)
Definition: RecSplit.hpp:363
sux::function::DoubleEF
Definition: DoubleEF.hpp:54
sux::function::__hash128_t::first
uint64_t first
Definition: RecSplit.hpp:103
sux::function
Definition: DoubleEF.hpp:43
sux::function::SplittingStrategy::split_iterator::difference_type
ptrdiff_t difference_type
Definition: RecSplit.hpp:180
SpookyHash::Hash128
static void Hash128(const void *data, size_t length, uint64_t *hash1, uint64_t *hash2)
Definition: SpookyV2.hpp:440
sux::function::RecSplit::RecSplit
RecSplit()
Definition: RecSplit.hpp:318
DoubleEF.hpp
sux::function::SplittingStrategy::split_iterator::operator*
size_t operator*() const
Definition: RecSplit.hpp:186
sux::function::__hash128_t
Definition: RecSplit.hpp:102
sux::function::DoubleEF::bitCountCumKeys
uint64_t bitCountCumKeys()
Definition: DoubleEF.hpp:297
sux::lambda
int lambda(uint64_t word)
Definition: common.hpp:178
golomb_param
#define golomb_param(m)
Definition: RecSplit.hpp:284
sux::function::RecSplit::operator<<
friend ostream & operator<<(ostream &os, const RecSplit< LEAF_SIZE, AT > &rs)
Definition: RecSplit.hpp:806
sux::function::SplittingStrategy::split_iterator::pointer
size_t * pointer
Definition: RecSplit.hpp:181
sux::function::SplittingStrategy::SplittingStrategy
SplittingStrategy(size_t m)
Definition: RecSplit.hpp:198
sux::function::SplittingStrategy::end
split_iterator end()
Definition: RecSplit.hpp:205
sux::nu
int nu(uint64_t word)
Definition: common.hpp:339
sux::function::SplittingStrategy::split_iterator::value_type
size_t value_type
Definition: RecSplit.hpp:179
sux::function::RecSplit::size
size_t size()
Definition: RecSplit.hpp:436
sux::function::SplittingStrategy::split_iterator::iterator_category
input_iterator_tag iterator_category
Definition: RecSplit.hpp:183
sux::function::RecSplit::operator()
size_t operator()(const string &key)
Definition: RecSplit.hpp:433
sux::function::RecSplit::operator()
size_t operator()(const hash128_t &hash)
Definition: RecSplit.hpp:377
sux::function::SplittingStrategy::split_iterator::operator!=
bool operator!=(const split_iterator &other) const
Definition: RecSplit.hpp:195
sux::function::SplittingStrategy::fanout
size_t fanout()
Definition: RecSplit.hpp:207
sux::function::DoubleEF::bitCountPosition
uint64_t bitCountPosition()
Definition: DoubleEF.hpp:299
sux::function::SplittingStrategy::begin
split_iterator begin()
Definition: RecSplit.hpp:204
sux::function::RiceBitVector::skipSubtree
void skipSubtree(const size_t nodes, const size_t fixed_len)
Definition: RiceBitVector.hpp:158
sux::function::RecSplit::RecSplit
RecSplit(vector< hash128_t > &keys, const size_t bucket_size)
Definition: RecSplit.hpp:350
sux::function::RecSplit::operator>>
friend istream & operator>>(istream &is, RecSplit< LEAF_SIZE, AT > &rs)
Definition: RecSplit.hpp:816
first_hash
#define first_hash(k, len)
Definition: RecSplit.hpp:283
sux::function::hash128_t
struct sux::function::__hash128_t hash128_t
sux::function::__hash128_t::__hash128_t
__hash128_t(const uint64_t first, const uint64_t second)
Definition: RecSplit.hpp:105
sux::function::RiceBitVector
Definition: RiceBitVector.hpp:48
sux::function::SplittingStrategy::split_iterator::operator->
size_t * operator->() const
Definition: RecSplit.hpp:187
sux::function::SplittingStrategy
Definition: RecSplit.hpp:145
sux::function::SplittingStrategy::split_iterator::reference
size_t & reference
Definition: RecSplit.hpp:182
skip_nodes
#define skip_nodes(m)
Definition: RecSplit.hpp:286
RiceBitVector.hpp