pasta::bit_vector  v1.0.0
Compact and Fast Rank and Select Data Structure for Bit Vectors
rank_select.hpp
1/*******************************************************************************
2 * This file is part of pasta::bit_vector.
3 *
4 * Copyright (C) 2021 Florian Kurpicz <florian@kurpicz.org>
5 *
6 * pasta::bit_vector is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * pasta::bit_vector is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with pasta::bit_vector. If not, see <http://www.gnu.org/licenses/>.
18 *
19 ******************************************************************************/
20
21/*
22 * Based on
23 *
24 * @inproceedings{Zhou2013RankSelect,
25 * author = {Dong Zhou and David G. Andersen and Michael Kaminsky},
26 * title = {Space-Efficient, High-Performance Rank and Select Structures
27 * on Uncompressed Bit Sequences},
28 * booktitle = {12th International Symposium on Experimental Algorithms
29 * ({SEA})},
30 * series = {LNCS},
31 * volume = {7933},
32 * pages = {151--163},
33 * publisher = {Springer},
34 * year = {2013},
35 * }
36 */
37
38#pragma once
39
40#include "pasta/bit_vector/bit_vector.hpp"
41#include "pasta/bit_vector/support/l12_type.hpp"
42#include "pasta/bit_vector/support/optimized_for.hpp"
43#include "pasta/bit_vector/support/popcount.hpp"
44#include "pasta/bit_vector/support/rank.hpp"
45#include "pasta/bit_vector/support/select.hpp"
46
47#include <cstddef>
48#include <cstdint>
49#include <limits>
50#include <tlx/container/simple_vector.hpp>
51#include <vector>
52
53namespace pasta {
54
55//! \addtogroup pasta_bit_vector_rank_select
56//! \{
57
58/*!
59 * \brief Select support for \c BitVector
60 *
61 * The rank and select support is based on popcount and described in detail by
62 * Zhou et al. \cite ZhouAK2013PopcountRankSelect. The data structure consists
63 * of three levels (L0, L1, and L2) that contain different information
64 * regarding the popcount of the current block or all previous blocks.
65 *
66 * Since the rank data structures used by rank are a strict subset of the data
67 * structures required by select, we provide a (ever so slightly) more
68 * space-efficient rank only data structure in \c BitVectorRank.
69 *
70 * \tparam OptimizedFor Compile time option to optimize data structure for
71 * either 0, 1, or no specific type of query.
72 * \tparam VectorType Type of the vector the rank and select data structure is
73 * constructed for, e.g., plain \c BitVector or a compressed bit vector.
74 */
75template <OptimizedFor optimized_for = OptimizedFor::DONT_CARE,
76 typename VectorType = BitVector>
77class RankSelect final : public Rank<optimized_for, VectorType> {
78 //! Get access to protected members of base class, as dependent
79 //! names are not considered.
80 using Rank<optimized_for>::data_size_;
81 //! Get access to protected members of base class, as dependent
82 //! names are not considered.
83 using Rank<optimized_for>::data_;
84 //! Get access to protected members of base class, as dependent
85 //! names are not considered.
86 using Rank<optimized_for>::l0_;
87 //! Get access to protected members of base class, as dependent
88 //! names are not considered.
89 using Rank<optimized_for>::l12_;
90
91 template <typename T>
92 using Array = tlx::SimpleVector<T, tlx::SimpleVectorMode::NoInitNoDestroy>;
93
94 // Members for the structure (needed only for select)
95 //! Staring positions of the samples of zeros (w.r.t. L0-blocks)
96 Array<uint64_t> samples0_pos_;
97 //! Staring positions of the samples of ones (w.r.t. L0-blocks)
98 Array<uint64_t> samples1_pos_;
99 //! Positions of every \c SELECT_SAMPLE_RATE zero.
100 std::vector<uint32_t> samples0_;
101 //! Positions of every \c SELECT_SAMPLE_RATE one.
102 std::vector<uint32_t> samples1_;
103
104public:
105 //! Default constructor w/o parameter.
106 RankSelect() = default;
107
108 /*!
109 * \brief Constructor. Creates the auxiliary information for
110 * efficient rank and select queries.
111 *
112 * \param bv Vector of \c VectorType the rank and select structure is created
113 * for.
114 */
115 RankSelect(VectorType& bv)
116 : Rank<optimized_for>(bv),
117 samples0_pos_((data_size_ / PopcntRankSelectConfig::L0_WORD_SIZE) + 1),
118 samples1_pos_((data_size_ / PopcntRankSelectConfig::L0_WORD_SIZE) + 1) {
119 init();
120 }
121
122 //! Default move constructor.
123 RankSelect(RankSelect&& other) = default;
124
125 //! Default move assignment.
126 RankSelect& operator=(RankSelect&& other) = default;
127
128 //! Destructor. Deleting manually created arrays.
129 ~RankSelect() = default;
130
131 /*!
132 * \brief Get position of specific zero, i.e., select.
133 * \param rank Rank of zero the position is searched for.
134 * \return Position of the rank-th zero.
135 */
136 [[nodiscard("select0 computed but not used")]] size_t
137 select0(size_t rank) const {
138 size_t const l0_end = l0_.size();
139 size_t const l12_end = l12_.size();
140 size_t l0_pos = 0;
141
142 if constexpr (optimize_one_or_dont_care(optimized_for)) {
143 while (l0_pos + 1 < l0_end &&
144 ((l0_pos + 1) * PopcntRankSelectConfig::L0_BIT_SIZE) -
145 l0_[l0_pos + 1] <
146 rank) {
147 ++l0_pos;
148 }
149 } else {
150 while (l0_pos + 1 < l0_end && l0_[l0_pos + 1] < rank) {
151 ++l0_pos;
152 }
153 }
154 if (l0_pos == l0_end) [[unlikely]] {
155 return data_size_;
156 }
157 if constexpr (optimize_one_or_dont_care(optimized_for)) {
158 rank -= (l0_pos * PopcntRankSelectConfig::L0_BIT_SIZE) - l0_[l0_pos];
159 } else {
160 rank -= l0_[l0_pos];
161 }
162
163 size_t const sample_pos =
165 samples0_pos_[l0_pos];
166
167 size_t l1_pos = samples0_[sample_pos];
168 l1_pos += ((rank - 1) % PopcntRankSelectConfig::SELECT_SAMPLE_RATE) /
170 size_t const l0_block_end =
171 std::min<size_t>(
172 ((l0_pos + 1) * (PopcntRankSelectConfig::L0_WORD_SIZE /
174 l12_end) -
175 1;
176 l1_pos = std::min<size_t>(l1_pos, l0_block_end);
177 size_t l2_pos = 0;
178 if constexpr (optimize_one_or_dont_care(optimized_for)) {
179 while (l1_pos + 1 < l0_block_end &&
180 ((l1_pos + 1) * PopcntRankSelectConfig::L1_BIT_SIZE) -
181 l12_[l1_pos + 1].l1 <
182 rank) {
183 ++l1_pos;
184 }
185 rank -= (l1_pos * PopcntRankSelectConfig::L1_BIT_SIZE) -
186 (l0_pos * PopcntRankSelectConfig::L0_BIT_SIZE) - l12_[l1_pos].l1;
187
188 auto l2 = l12_[l1_pos].l2_values;
189 while (l2_pos < 3 && PopcntRankSelectConfig::L2_BIT_SIZE -
190 (l2 & uint16_t(0b1111111111)) <
191 rank) {
192 rank -=
193 PopcntRankSelectConfig::L2_BIT_SIZE - (l2 & uint16_t(0b1111111111));
194 l2 >>= 10;
195 ++l2_pos;
196 }
197 } else {
198 while (l1_pos + 1 < l0_block_end && l12_[l1_pos + 1].l1 < rank) {
199 ++l1_pos;
200 }
201 rank -= l12_[l1_pos].l1;
202 auto l2 = l12_[l1_pos].l2_values;
203 while (l2_pos < 3 && (l2 & uint16_t(0b1111111111)) < rank) {
204 rank -= (l2 & uint16_t(0b1111111111));
205 l2 >>= 10;
206 ++l2_pos;
207 }
208 }
209
210 size_t last_pos = (PopcntRankSelectConfig::L2_WORD_SIZE * l2_pos) +
212 size_t popcount = 0;
213
214 while ((popcount = pasta::popcount_zeros<1>(data_ + last_pos)) < rank) {
215 ++last_pos;
216 rank -= popcount;
217 }
218
219 return (last_pos * 64) + select(~data_[last_pos], rank - 1);
220 }
221
222 /*!
223 * \brief Get position of specific one, i.e., select.
224 * \param rank Rank of one the position is searched for.
225 * \return Position of the rank-th one.
226 */
227 [[nodiscard("select1 computed but not used")]] size_t
228 select1(size_t rank) const {
229 size_t const l0_end = l0_.size();
230 size_t const l12_end = l12_.size();
231 size_t l0_pos = 0;
232
233 if constexpr (optimize_one_or_dont_care(optimized_for)) {
234 while (l0_pos + 1 < l0_end && l0_[l0_pos + 1] < rank) {
235 ++l0_pos;
236 }
237 } else {
238 while (l0_pos + 1 < l0_end &&
239 ((l0_pos + 1) * PopcntRankSelectConfig::L0_BIT_SIZE) -
240 l0_[l0_pos + 1] <
241 rank) {
242 ++l0_pos;
243 }
244 }
245 if (l0_pos == l0_end) [[unlikely]] {
246 return data_size_;
247 }
248 if constexpr (optimize_one_or_dont_care(optimized_for)) {
249 rank -= l0_[l0_pos];
250 } else {
251 rank -= (l0_pos * PopcntRankSelectConfig::L0_BIT_SIZE) - l0_[l0_pos];
252 }
253
254 size_t const sample_pos =
256 samples1_pos_[l0_pos];
257
258 size_t l1_pos = samples1_[sample_pos];
259 l1_pos += ((rank - 1) % PopcntRankSelectConfig::SELECT_SAMPLE_RATE) /
261 size_t const l0_block_end =
262 std::min<size_t>(
263 ((l0_pos + 1) * (PopcntRankSelectConfig::L0_WORD_SIZE /
265 l12_end) -
266 1;
267 l1_pos = std::min(l1_pos, l0_block_end);
268 size_t l2_pos = 0;
269 if constexpr (optimize_one_or_dont_care(optimized_for)) {
270 while (l1_pos + 1 < l0_block_end && l12_[l1_pos + 1].l1 < rank) {
271 ++l1_pos;
272 }
273 rank -= l12_[l1_pos].l1;
274 auto l2 = l12_[l1_pos].l2_values;
275 while (l2_pos < 3 && (l2 & uint16_t(0b1111111111)) < rank) {
276 rank -= (l2 & uint16_t(0b1111111111));
277 l2 >>= 10;
278 ++l2_pos;
279 }
280 } else {
281 while (l1_pos + 1 < l0_block_end &&
282 ((l1_pos + 1) * PopcntRankSelectConfig::L1_BIT_SIZE) -
283 l12_[l1_pos + 1].l1 <
284 rank) {
285 ++l1_pos;
286 }
287 rank -= (l1_pos * PopcntRankSelectConfig::L1_BIT_SIZE) -
288 (l0_pos * PopcntRankSelectConfig::L0_BIT_SIZE) - l12_[l1_pos].l1;
289
290 auto l2 = l12_[l1_pos].l2_values;
291 while (l2_pos < 3 && PopcntRankSelectConfig::L2_BIT_SIZE -
292 (l2 & uint16_t(0b1111111111)) <
293 rank) {
294 rank -=
295 PopcntRankSelectConfig::L2_BIT_SIZE - (l2 & uint16_t(0b1111111111));
296 l2 >>= 10;
297 ++l2_pos;
298 }
299 }
300
301 size_t last_pos = (PopcntRankSelectConfig::L2_WORD_SIZE * l2_pos) +
303 size_t popcount = 0;
304
305 while ((popcount = pasta::popcount<1>(data_ + last_pos)) < rank) {
306 ++last_pos;
307 rank -= popcount;
308 }
309 return (last_pos * 64) + select(data_[last_pos], rank - 1);
310 }
311
312 /*!
313 * \brief Estimate for the space usage.
314 * \return Number of bytes used by this data structure.
315 */
316 [[nodiscard("space usage computed but not used")]] size_t
317 space_usage() const final {
318 return samples0_.size() * sizeof(uint32_t) +
319 samples1_.size() * sizeof(uint32_t) +
320 samples0_pos_.size() * sizeof(uint64_t) +
321 samples1_pos_.size() * sizeof(uint64_t) + sizeof(*this);
322 }
323
324private:
325 //! Function used initializing data structure to reduce LOCs of constructor.
326 void init() {
327 size_t const l12_end = l12_.size();
328
329 size_t next_sample0_value = 1;
330 size_t next_sample1_value = 1;
331 for (size_t l0_pos = 0, l12_pos = 0; l12_pos < l12_end; ++l12_pos) {
334 0) [[unlikely]] {
335 samples0_pos_[l0_pos] = samples0_.size();
336 samples1_pos_[l0_pos++] = samples1_.size();
337 next_sample0_value = 1;
338 next_sample1_value = 1;
339 }
340 if constexpr (optimize_one_or_dont_care(optimized_for)) {
342 ((l0_pos - 1) * PopcntRankSelectConfig::L0_BIT_SIZE) -
343 l12_[l12_pos].l1 >=
344 next_sample0_value) {
345 samples0_.push_back(l12_pos - 1);
346 next_sample0_value += PopcntRankSelectConfig::SELECT_SAMPLE_RATE;
347 }
348 if (l12_[l12_pos].l1 >= next_sample1_value) {
349 samples1_.push_back(l12_pos - 1);
350 next_sample1_value += PopcntRankSelectConfig::SELECT_SAMPLE_RATE;
351 }
352 } else {
353 if (l12_[l12_pos].l1 >= next_sample0_value) {
354 samples0_.push_back(l12_pos - 1);
355 next_sample0_value += PopcntRankSelectConfig::SELECT_SAMPLE_RATE;
356 }
358 ((l0_pos - 1) * PopcntRankSelectConfig::L0_BIT_SIZE) -
359 l12_[l12_pos].l1 >=
360 next_sample1_value) {
361 samples1_.push_back(l12_pos - 1);
362 next_sample1_value += PopcntRankSelectConfig::SELECT_SAMPLE_RATE;
363 }
364 }
365 }
366 // Add at least one entry.
367 if (samples0_.size() == 0) [[unlikely]] {
368 samples0_.push_back(0);
369 }
370 if (samples1_.size() == 0) [[unlikely]] {
371 samples1_.push_back(0);
372 }
373 }
374}; // class RankSelect
375
376//! \}
377
378} // namespace pasta
379
380/******************************************************************************/
Select support for BitVector.
Definition: rank_select.hpp:77
RankSelect & operator=(RankSelect &&other)=default
Default move assignment.
RankSelect()=default
Default constructor w/o parameter.
~RankSelect()=default
Destructor. Deleting manually created arrays.
size_t select1(size_t rank) const
Get position of specific one, i.e., select.
Definition: rank_select.hpp:228
RankSelect(VectorType &bv)
Constructor. Creates the auxiliary information for efficient rank and select queries.
Definition: rank_select.hpp:115
size_t space_usage() const final
Estimate for the space usage.
Definition: rank_select.hpp:317
RankSelect(RankSelect &&other)=default
Default move constructor.
size_t select0(size_t rank) const
Get position of specific zero, i.e., select.
Definition: rank_select.hpp:137
Rank support for BitVector.
Definition: rank.hpp:102
tlx::SimpleVector< uint64_t, tlx::SimpleVectorMode::NoInitNoDestroy > l0_
Array containing the number of set bits in the L0-blocks.
Definition: rank.hpp:112
VectorType::RawDataConstAccess data_
Pointer to the data of the bit vector.
Definition: rank.hpp:107
size_t data_size_
Size of the bit vector the rank support is constructed for.
Definition: rank.hpp:105
tlx::SimpleVector< L12Type, tlx::SimpleVectorMode::NoInitNoDestroy > l12_
Array containing the information about the L1- and L2-blocks.
Definition: rank.hpp:115
OptimizedFor
Enum used to specify which queries the rank (and select) data structures should be optimized for.
Definition: optimized_for.hpp:40
constexpr bool optimize_one_or_dont_care(OptimizedFor const optimized_for)
Helper function indicating if queries should be optimized for one queries or the caller does not care...
Definition: optimized_for.hpp:59
@ DONT_CARE
It does not matter (both types are called equally often).
Static configuration for Rank and RankSelect.
Definition: rank.hpp:60
static constexpr size_t L0_WORD_SIZE
Number of 64-bit words covered by an L0-block.
Definition: rank.hpp:74
static constexpr size_t SELECT_SAMPLE_RATE
Sample rate of positions for faster select queries.
Definition: rank.hpp:77
static constexpr size_t L2_WORD_SIZE
Number of 64-bit words covered by an L2-block.
Definition: rank.hpp:70
static constexpr size_t L2_BIT_SIZE
Bits covered by an L2-block.
Definition: rank.hpp:62
static constexpr size_t L0_BIT_SIZE
Bits covered by an L0-block.
Definition: rank.hpp:66
static constexpr size_t L1_BIT_SIZE
Bits covered by an L1-block.
Definition: rank.hpp:64
static constexpr size_t L1_WORD_SIZE
Number of 64-bit words covered by an L1-block.
Definition: rank.hpp:72