SeqAn3 3.2.0-rc.1
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <filesystem>
17#include <seqan3/std/ranges>
18
19#include <sdsl/suffix_trees.hpp>
20
26
27namespace seqan3::detail
28{
33{
48 template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
49 static void validate(text_t && text)
50 {
51 if constexpr (text_layout_mode_ == text_layout::single)
52 {
53 static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
54 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
55 "The alphabet of the text collection must be convertible to the alphabet of the index.");
56 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
57
58 if (std::ranges::empty(text))
59 throw std::invalid_argument("The text to index cannot be empty.");
60 }
61 else
62 {
63 static_assert(std::ranges::bidirectional_range<text_t>,
64 "The text collection must model bidirectional_range.");
65 static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
66 "The elements of the text collection must model bidirectional_range.");
67 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
68 "The alphabet of the text collection must be convertible to the alphabet of the index.");
69 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
70
71 if (std::ranges::empty(text))
72 throw std::invalid_argument("The text collection to index cannot be empty.");
73 }
74 static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
75 }
76};
77} // namespace seqan3::detail
78
79namespace seqan3
80{
81
83// forward declarations
84template <typename index_t>
85class fm_index_cursor;
86
87template <typename index_t>
88class bi_fm_index_cursor;
89
90namespace detail
91{
92template <semialphabet alphabet_t,
93 text_layout text_layout_mode_,
94 detail::sdsl_index sdsl_index_type_>
95class reverse_fm_index;
96}
98
132 sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
133 sdsl::rank_support_v<>,
134 sdsl::select_support_scan<>,
135 sdsl::select_support_scan<0>>,
136 16, // Sampling rate of the suffix array
137 10'000'000, // Sampling rate of the inverse suffix array
138 sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
139 sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
140 sdsl::plain_byte_alphabet>; // How to represent the alphabet
141
148
187template <semialphabet alphabet_t,
188 text_layout text_layout_mode_,
189 detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
191{
192private:
197 using sdsl_index_type = sdsl_index_type_;
201 using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
203 using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
205
206 friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
207
210
212 sdsl::sd_vector<> text_begin;
214 sdsl::select_support_sd<1> text_begin_ss;
216 sdsl::rank_support_sd<1> text_begin_rs;
217
219 template <typename output_it_t, typename sequence_t>
220 static output_it_t copy_sequence_ranks_shifted_by_one(output_it_t output_it, sequence_t && sequence)
221 {
222 constexpr size_t sigma = alphabet_size<alphabet_t>;
223 constexpr size_t max_sigma = text_layout_mode_ == text_layout::single ? 256u : 255u;
224
225 constexpr auto warn_if_rank_out_of_range = [](uint8_t const rank)
226 {
227 if (rank >= max_sigma - 1) // same as rank + 1 >= max_sigma but without overflow
228 throw std::out_of_range("The input text cannot be indexed, because for full"
229 "character alphabets the last one/two values are reserved"
230 "(single sequence/collection).");
231 };
232
233 return std::ranges::transform(sequence, output_it, [&warn_if_rank_out_of_range](auto const & chr)
234 {
235 uint8_t const rank = seqan3::to_rank(chr);
236 if constexpr (sigma >= max_sigma)
237 warn_if_rank_out_of_range(rank);
238 return rank + 1;
239 }).out;
240 }
241
261 template <std::ranges::range text_t>
263 requires (text_layout_mode_ == text_layout::single)
265 void construct(text_t && text)
266 {
267 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
268
269 // TODO:
270 // * check what happens in sdsl when constructed twice!
271 // * choose between in-memory/external and construction algorithms
272 // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
273 // uint8_t largest_char = 0;
274 sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
275
276 // copy ranks into tmp_text
277 copy_sequence_ranks_shifted_by_one(std::ranges::begin(tmp_text), text | std::views::reverse);
278
279 sdsl::construct_im(index, tmp_text, 0);
280
281 // TODO: would be nice but doesn't work since it's private and the public member references are const
282 // index.m_C.resize(largest_char);
283 // index.m_C.shrink_to_fit();
284 // index.m_sigma = largest_char;
285 }
286
288 template <std::ranges::range text_t>
290 requires (text_layout_mode_ == text_layout::collection)
292 void construct(text_t && text, bool reverse = false)
293 {
294 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
295
296 std::vector<size_t> text_sizes;
297
298 for (auto && t : text)
299 text_sizes.push_back(std::ranges::distance(t));
300
301 size_t const number_of_texts{text_sizes.size()};
302
303 // text size including delimiters
304 size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), number_of_texts);
305
306 if (number_of_texts == text_size)
307 throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
308
309 constexpr auto sigma = alphabet_size<alphabet_t>;
310
311 // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
312 // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
313 // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
314 sdsl::sd_vector_builder builder(text_size, number_of_texts);
315 size_t prefix_sum{0};
316
317 for (auto && size : text_sizes)
318 {
319 builder.set(prefix_sum);
320 prefix_sum += size + 1;
321 }
322
323 text_begin = sdsl::sd_vector<>(builder);
324 text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
325 text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
326
327 // last text in collection needs no delimiter if we have more than one text in the collection
328 sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
329
330 constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
331
332 auto copy_join_with = [](auto output_it, auto && collection)
333 {
334 // this is basically std::views::join() with a delimiter
335 auto collection_it = std::ranges::begin(collection);
336 auto const collection_sentinel = std::ranges::end(collection);
337 if (collection_it == collection_sentinel)
338 return;
339
340 output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
341 ++collection_it;
342
343 for (; collection_it != collection_sentinel; ++collection_it)
344 {
345 *output_it = delimiter;
346 ++output_it;
347 output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
348 }
349 };
350
351 // copy ranks into tmp_text
352 copy_join_with(std::ranges::begin(tmp_text), text);
353
354 if (!reverse)
355 {
356 // we need at least one delimiter
357 if (number_of_texts == 1)
358 tmp_text.back() = delimiter;
359
360 std::ranges::reverse(tmp_text);
361 }
362 else
363 {
364 // If only one text is in the text collection, we still need one delimiter at the end to be able to
365 // conduct rank and select queries when locating hits in the index.
366 // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
367 if (number_of_texts == 1)
368 {
369 std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
370 tmp_text.back() = delimiter;
371 }
372 else
373 {
374 std::ranges::reverse(tmp_text);
375 }
376 }
377
378 sdsl::construct_im(index, tmp_text, 0);
379 }
380
381public:
383 static constexpr text_layout text_layout_mode = text_layout_mode_;
384
389 using alphabet_type = alphabet_t;
391 using size_type = typename sdsl_index_type::size_type;
395
396 template <typename bi_fm_index_t>
397 friend class bi_fm_index_cursor;
398
399 template <typename fm_index_t>
400 friend class fm_index_cursor;
401
402 template <typename fm_index_t>
403 friend class detail::fm_index_cursor_node;
404
408 fm_index() = default;
409
411 fm_index(fm_index const & rhs) :
413 {
414 text_begin_ss.set_vector(&text_begin);
415 text_begin_rs.set_vector(&text_begin);
416 }
417
420 index{std::move(rhs.index)}, text_begin{std::move(rhs.text_begin)},text_begin_ss{std::move(rhs.text_begin_ss)},
422 {
423 text_begin_ss.set_vector(&text_begin);
424 text_begin_rs.set_vector(&text_begin);
425 }
426
429 {
430 index = std::move(rhs.index);
431 text_begin = std::move(rhs.text_begin);
432 text_begin_ss = std::move(rhs.text_begin_ss);
433 text_begin_rs = std::move(rhs.text_begin_rs);
434
435 text_begin_ss.set_vector(&text_begin);
436 text_begin_rs.set_vector(&text_begin);
437
438 return *this;
439 }
440
441 ~fm_index() = default;
442
451 template <std::ranges::bidirectional_range text_t>
452 explicit fm_index(text_t && text)
453 {
454 construct(std::forward<text_t>(text));
455 }
457
469 size_type size() const noexcept
470 {
471 return index.size();
472 }
473
485 bool empty() const noexcept
486 {
487 return size() == 0;
488 }
489
501 bool operator==(fm_index const & rhs) const noexcept
502 {
503 // (void) rhs;
504 return (index == rhs.index) && (text_begin == rhs.text_begin);
505 }
506
518 bool operator!=(fm_index const & rhs) const noexcept
519 {
520 return !(*this == rhs);
521 }
522
537 cursor_type cursor() const noexcept
538 {
539 return {*this};
540 }
541
549 template <cereal_archive archive_t>
550 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
551 {
552 archive(index);
553 archive(text_begin);
554 archive(text_begin_ss);
555 text_begin_ss.set_vector(&text_begin);
556 archive(text_begin_rs);
557 text_begin_rs.set_vector(&text_begin);
558
559 auto sigma = alphabet_size<alphabet_t>;
560 archive(sigma);
561 if (sigma != alphabet_size<alphabet_t>)
562 {
563 throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma) +
564 " but it is being read into an fm_index with an alphabet of size " +
565 std::to_string(alphabet_size<alphabet_t>) + "."};
566 }
567
568 bool tmp = text_layout_mode;
569 archive(tmp);
570 if (tmp != text_layout_mode)
571 {
572 throw std::logic_error{std::string{"The fm_index was built over a "} +
573 (tmp ? "text collection" : "single text") +
574 " but it is being read into an fm_index expecting a " +
575 (text_layout_mode ? "text collection." : "single text.")};
576 }
577 }
579
580};
581
586template <std::ranges::range text_t>
587fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
589} // namespace seqan3
590
591namespace seqan3::detail
592{
593
607template <semialphabet alphabet_t,
608 text_layout text_layout_mode,
609 detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
610class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
611{
612private:
614 template <std::ranges::range text_t>
615 void construct_(text_t && text)
616 {
617 if constexpr (text_layout_mode == text_layout::single)
618 {
619 auto reverse_text = text | std::views::reverse;
620 this->construct(reverse_text);
621 }
622 else
623 {
624 auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
625 this->construct(reverse_text, true);
626 }
627 }
628
629public:
631
633 template <std::ranges::bidirectional_range text_t>
634 explicit reverse_fm_index(text_t && text)
635 {
636 construct_(std::forward<text_t>(text));
637 }
638
639};
640
641} // namespace seqan3::detail
T accumulate(T... args)
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:55
An FM Index specialisation that handles reversing the given text.
Definition: fm_index.hpp:611
void construct_(text_t &&text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:615
reverse_fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:634
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
The SeqAn FM Index.
Definition: fm_index.hpp:191
sdsl_index_type_ sdsl_index_type
The type of the underlying SDSL index.
Definition: fm_index.hpp:197
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition: fm_index.hpp:428
fm_index()=default
Defaulted.
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition: fm_index.hpp:383
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching. Cursor is pointing to...
Definition: fm_index.hpp:537
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:469
sdsl::rank_support_sd< 1 > text_begin_rs
Rank support for text_begin.
Definition: fm_index.hpp:216
sdsl_index_type index
Underlying index from the SDSL.
Definition: fm_index.hpp:209
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:518
typename sdsl_index_type::alphabet_type::sigma_type sdsl_sigma_type
The type of the alphabet size of the underlying SDSL index.
Definition: fm_index.hpp:203
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:485
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:501
typename sdsl_index_type::alphabet_type::char_type sdsl_char_type
The type of the reduced alphabet type. (The reduced alphabet might be smaller than the original alpha...
Definition: fm_index.hpp:201
~fm_index()=default
Defaulted.
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition: fm_index.hpp:389
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition: fm_index.hpp:419
void construct(text_t &&text, bool reverse=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index.hpp:292
void construct(text_t &&text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:265
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:391
static output_it_t copy_sequence_ranks_shifted_by_one(output_it_t output_it, sequence_t &&sequence)
Eagerly convert sequence into ranks, shift by one and copy them into output_it.
Definition: fm_index.hpp:220
sdsl::sd_vector text_begin
Bitvector storing begin positions for collections.
Definition: fm_index.hpp:212
sdsl::select_support_sd< 1 > text_begin_ss
Select support for text_begin.
Definition: fm_index.hpp:214
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:452
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition: fm_index.hpp:411
A wrapper type around an existing view adaptor that enables "deep view" behaviour for that view.
Definition: deep.hpp:104
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:72
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:147
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:140
@ single
The text is a single range.
Definition: concept.hpp:74
@ collection
The text is a range of ranges.
Definition: concept.hpp:76
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:495
The basis for seqan3::alphabet, but requires only rank interface (not char).
The generic concept for a (biological) sequence.
The internal SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:587
SeqAn specific customisations in the standard namespace.
#define CEREAL_SERIALIZE_FUNCTION_NAME
Macro for Cereal's serialize function.
Definition: platform.hpp:126
T push_back(T... args)
The <ranges> header from C++20's standard library.
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
Internal representation of the node of an FM index cursor.
Definition: fm_index_cursor.hpp:30
Class used to validate the requirements on the input text of the fm_index.
Definition: fm_index.hpp:33
static void validate(text_t &&text)
Validates the fm_index template parameters and text.
Definition: fm_index.hpp:49
Provides seqan3::views::to_rank.
T to_string(T... args)