Bifrost
CompactedDBG.hpp
Go to the documentation of this file.
1 #ifndef BFG_COMPACTED_DBG_HPP
2 #define BFG_COMPACTED_DBG_HPP
3 
4 #include <cmath>
5 #include <cstdlib>
6 #include <cstring>
7 #include <ctime>
8 #include <cstdio>
9 #include <climits>
10 #include <functional>
11 #include <getopt.h>
12 #include <iostream>
13 #include <sstream>
14 #include <stdint.h>
15 #include <string>
16 #include <unordered_map>
17 #include <unordered_set>
18 #include <vector>
19 
20 #include <thread>
21 #include <atomic>
22 #include <mutex>
23 
24 #include "BlockedBloomFilter.hpp"
25 #include "Common.hpp"
26 #include "File_Parser.hpp"
27 #include "FASTX_Parser.hpp"
28 #include "GFA_Parser.hpp"
29 #include "Kmer.hpp"
30 #include "KmerHashTable.hpp"
31 #include "KmerIterator.hpp"
32 #include "KmerStream.hpp"
33 #include "Lock.hpp"
34 #include "minHashIterator.hpp"
35 #include "RepHash.hpp"
36 #include "TinyVector.hpp"
37 #include "Unitig.hpp"
38 #include "UnitigIterator.hpp"
39 #include "UnitigMap.hpp"
40 
41 #include "roaring.hh"
42 
// Bit masks and default parameters used by the graph.
// NOTE(review): the names suggest a unitig ID, a type flag and a position packed
// into a single 64-bit word — confirm against the uses in CompactedDBG.tcc.
43 #define MASK_CONTIG_ID (0xffffffff00000000) // selects the upper 32 bits of a 64-bit value
44 #define MASK_CONTIG_TYPE (0x80000000) // selects bit 31 only
45 #define MASK_CONTIG_POS (0x7fffffff) // selects the lower 31 bits
46 #define RESERVED_ID (0xffffffff) // all of the low 32 bits set; NOTE(review): presumably a sentinel ID — confirm
47 
// Defaults for the k-mer length and the minimizer length: they are the default
// arguments of the CompactedDBG constructor and the initial values of
// CDBG_Build_opt::k and CDBG_Build_opt::g.
48 #define DEFAULT_K 31
49 #define DEFAULT_G 23
50 
56 using namespace std;
57 
125 
126  bool verbose;
127  size_t nb_threads;
129 
130  //size_t nb_unique_kmers;
131  //size_t nb_non_unique_kmers;
134 
137 
138  vector<string> filename_seq_in;
139  vector<string> filename_ref_in;
140 
141  // The following members are not used by CompactedDBG<U, G>::build
142  // but you can set them to use them as parameters for other functions
143  // such as CompactedDBG<U, G>::simplify, CompactedDBG<U, G>::read or
144  // CompactedDBG<U, G>::write.
145 
146  size_t k, g;
147 
148  bool build;
149  bool update;
150 
151  bool clipTips;
154 
155  bool outputGFA;
156 
158 
160 
// Default constructor: one thread, k = DEFAULT_K, g = DEFAULT_G, 14 Bloom filter bits
// per k-mer for both filters, read chunks of 64, every boolean option off except
// outputGFA (on). Members not named in the list (e.g. the filename vectors) are
// default-constructed.
// NOTE: C++ initializes members in declaration order, not in the order written here
// (verbose is declared before nb_threads but listed last). No initializer depends on
// another member, so the result is the same, but compilers may warn about the reordering.
161  CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(DEFAULT_G), /*nb_unique_kmers(0), nb_non_unique_kmers(0),*/
162  nb_bits_unique_kmers_bf(14), nb_bits_non_unique_kmers_bf(14), read_chunksize(64),
163  build(false), update(false), clipTips(false), deleteIsolated(false), useMercyKmers(false),
164  outputGFA(true), verbose(false) {}
165 };
166 
172 template<typename U, typename G> using const_UnitigMap = UnitigMap<U, G, true>;
173 
203 template<typename Unitig_data_t, typename Graph_data_t = void> //Curiously Recurring Template Pattern (CRTP)
204 class CDBG_Data_t {
205 
206  public:
207 
214 
229 
241 
// Extract data corresponding to a sub-unitig (um_src) of a unitig.
// This base-class version has an empty body and therefore does nothing.
// NOTE(review): presumably a user data class deriving from CDBG_Data_t supplies its
// own extract() with this signature — confirm how the graph dispatches to it.
256  void extract(const UnitigMap<Unitig_data_t, Graph_data_t>& um_src, bool last_extraction){}
257 
267 
268  return string();
269  }
270 };
271 
296 template<typename Unitig_data_t = void, typename Graph_data_t = void>
298 
299  static_assert(is_void<Unitig_data_t>::value || is_base_of<CDBG_Data_t<Unitig_data_t, Graph_data_t>, Unitig_data_t>::value,
300  "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
301 
302  typedef Unitig_data_t U;
303  typedef Graph_data_t G;
304 
305  public:
306 
307  template<typename U, typename G, bool is_const> friend class UnitigMap;
308  template<typename U, typename G, bool is_const> friend class unitigIterator;
309  template<typename U, typename G, bool is_const> friend class neighborIterator;
310 
318  CompactedDBG(const int kmer_length = DEFAULT_K, const int minimizer_length = DEFAULT_G);
319 
325  CompactedDBG(const CompactedDBG& o); // Copy constructor
326 
332  CompactedDBG(CompactedDBG&& o); // Move constructor
333 
336  virtual ~CompactedDBG();
337 
344  CompactedDBG<U, G>& operator=(const CompactedDBG& o);
345 
352  CompactedDBG<U, G>& operator=(CompactedDBG&& o);
353 
365  CompactedDBG<U, G>& operator+=(const CompactedDBG& o);
366 
371  bool operator==(const CompactedDBG& o) const;
372 
377  inline bool operator!=(const CompactedDBG& o) const;
378 
381  void clear();
382 
387  bool build(CDBG_Build_opt& opt);
388 
395  bool simplify(const bool delete_short_isolated_unitigs = true, const bool clip_short_tips = true, const bool verbose = false);
396 
404  bool write(const string& output_filename, const size_t nb_threads = 1, const bool GFA_output = true, const bool verbose = false) const;
405 
414  bool read(const string& input_filename, const bool verbose = false);
415 
423  UnitigMap<U, G> find(const Kmer& km, const bool extremities_only = false);
424 
432  const_UnitigMap<U, G> find(const Kmer& km, const bool extremities_only = false) const;
433 
442  UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len);
443 
456  vector<pair<size_t, UnitigMap<U, G>>> searchSequence( const string& seq, const bool exact, const bool insertion, const bool deletion,
457  const bool substitution, const bool or_exclusive_match = false) const;
458 
466  bool add(const string& seq, const bool verbose = false);
467 
473  bool remove(const const_UnitigMap<U, G>& um, const bool verbose = false);
474 
487  bool merge(const CompactedDBG& o, const size_t nb_threads = 1, const bool verbose = false);
488 
499  bool merge(const vector<CompactedDBG>& v, const size_t nb_threads = 1, const bool verbose = false);
500 
504  iterator begin();
505 
509  const_iterator begin() const;
510 
514  iterator end();
515 
519  const_iterator end() const;
520 
524  size_t length() const;
525 
526  size_t nbKmers() const;
527 
// Return the value of the `invalid` flag of this graph (true means the graph is invalid).
531  inline bool isInvalid() const { return invalid; }
532 
// Return the k-mer length stored in k_.
536  inline int getK() const { return k_; }
537 
// Return the number of unitigs in the graph, computed as the sum of the sizes of the
// three internal containers: v_unitigs, v_kmers and h_kmers_ccov.
541  inline size_t size() const { return v_unitigs.size() + v_kmers.size() + h_kmers_ccov.size(); }
542 
// Return a pointer to the graph data, as provided by the wrapperData<G> member `data`.
// NOTE(review): what is returned when G is void depends on wrapperData — confirm.
546  inline G* getData() { return data.getData(); }
547 
// Const overload: return a constant pointer to the graph data held by the
// wrapperData<G> member `data`.
551  inline const G* getData() const { return data.getData(); }
552 
553  protected:
554 
555  bool annotateSplitUnitigs(const CompactedDBG<U, G>& o, const size_t nb_threads = 1, const bool verbose = false);
556 
557  pair<size_t, size_t> splitAllUnitigs();
558  pair<size_t, size_t> getSplitInfoAllUnitigs() const;
559 
// Forward to the joinUnitigs_ overload selected at compile time by whether the
// unitig data type U is void (is_void<U>::value), passing both arguments through
// unchanged, and return its result.
// NOTE(review): the meaning of v_joins and of the returned count is defined by
// joinUnitigs_ in CompactedDBG.tcc — confirm there.
560  inline size_t joinUnitigs(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1) {
561 
562  return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
563  }
564 
565  bool mergeData(const CompactedDBG<U, G>& o, const size_t nb_threads = 1, const bool verbose = false);
566  bool mergeData(CompactedDBG<U, G>&& o, const size_t nb_threads = 1, const bool verbose = false);
567 
568  private:
569 
570  bool filter(const CDBG_Build_opt& opt, const size_t nb_unique_kmers, const size_t nb_non_unique_kmers);
571  bool construct(const CDBG_Build_opt& opt, const size_t nb_unique_minimizers, const size_t nb_non_unique_minimizers);
572 
573  bool addUnitigSequenceBBF(const Kmer km, const string& seq, const size_t pos_match_km, const size_t len_match_km, LockGraph& lck_g);
574 
575  size_t findUnitigSequenceBBF(Kmer km, string& s, bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
576  bool bwStepBBF(const Kmer km, Kmer& front, char& c, bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip, const bool check_fp_cand = true) const;
577  bool fwStepBBF(const Kmer km, Kmer& end, char& c, bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip, const bool check_fp_cand = true) const;
578 
// Test whether the minimizer at the current position of it_min_h is a key of
// hmap_min_unitigs. The key is Minimizer(&it_min_h.s[pos]).rep(), built from the
// iterator's sequence at that position.
// NOTE(review): rep() is presumably the canonical representative of the minimizer — confirm.
// Returns 0 when the key is present; otherwise returns pos - it_min_h.p.
// NOTE(review): the difference is converted to size_t on return, so this assumes
// pos >= it_min_h.p — confirm against preAllocMinHashIterator.
579  inline size_t find(const preAllocMinHashIterator<RepHash>& it_min_h) const {
580 
581  const int pos = it_min_h.getPosition();
582  return (hmap_min_unitigs.find(Minimizer(&it_min_h.s[pos]).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
583  }
584 
585  UnitigMap<U, G> find(const char* s, const size_t pos_km, const minHashIterator<RepHash>& it_min, const bool extremities_only = false);
586  const_UnitigMap<U, G> find(const char* s, const size_t pos_km, const minHashIterator<RepHash>& it_min, const bool extremities_only = false) const;
587 
588  UnitigMap<U, G> find(const Kmer& km, const preAllocMinHashIterator<RepHash>& it_min_h);
589 
590  vector<const_UnitigMap<U, G>> findPredecessors(const Kmer& km, const bool extremities_only = false) const;
591  vector<const_UnitigMap<U, G>> findSuccessors(const Kmer& km, const size_t limit = 4, const bool extremities_only = false) const;
592 
593  vector<UnitigMap<U, G>> findPredecessors(const Kmer& km, const bool extremities_only = false);
594  vector<UnitigMap<U, G>> findSuccessors(const Kmer& km, const size_t limit = 4, const bool extremities_only = false);
595 
596  UnitigMap<U, G> findUnitig(const Kmer& km, const char* s, const size_t pos);
597  UnitigMap<U, G> findUnitig(const Kmer& km, const char* s, const size_t pos, const preAllocMinHashIterator<RepHash>& it_min_h);
598  UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len, const minHashIterator<RepHash>& it_min);
599 
600  bool addUnitig(const string& str_unitig, const size_t id_unitig);
601  bool addUnitig(const string& str_unitig, const size_t id_unitig, const size_t id_unitig_r, const size_t is_short_r);
602  void swapUnitigs(const bool isShort, const size_t id_a, const size_t id_b);
603 
604  bool mergeUnitig(const string& seq, const bool verbose = false);
605  bool annotateSplitUnitig(const string& seq, const bool verbose = false);
606  bool annotateSplitUnitig(const string& seq, LockGraph& lck_g, const bool verbose = false);
607 
// Overload enabled only when the template argument is_void is false (std::enable_if):
// calls merge(a, b) on the data object returned by a.getData(), i.e. merges the data
// of the mapping b into the data of the mapping a.
608  template<bool is_void>
609  inline typename std::enable_if<!is_void, void>::type mergeData_(const UnitigMap<U, G>& a, const const_UnitigMap<U, G>& b){
610 
611  a.getData()->merge(a, b);
612  }
613 
// Overload enabled only when the template argument is_void is true (std::enable_if):
// empty body, so nothing is merged.
614  template<bool is_void>
615  inline typename std::enable_if<is_void, void>::type mergeData_(const UnitigMap<U, G>& a, const const_UnitigMap<U, G>& b) {}
616 
617  template<bool is_void>
618  typename std::enable_if<!is_void, void>::type deleteUnitig_(const bool isShort, const bool isAbundant,
619  const size_t id_unitig, const bool delete_data = true);
620 
621  template<bool is_void>
622  typename std::enable_if<is_void, void>::type deleteUnitig_( const bool isShort, const bool isAbundant,
623  const size_t id_unitig, const bool delete_data = true);
624 
625  template<bool is_void>
626  typename std::enable_if<!is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs, size_t& nxt_pos_insert_v_unitigs,
627  size_t& v_unitigs_sz, size_t& v_kmers_sz, const vector<pair<int,int>>& sp);
628  template<bool is_void>
629  typename std::enable_if<is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs, size_t& nxt_pos_insert_v_unitigs,
630  size_t& v_unitigs_sz, size_t& v_kmers_sz, const vector<pair<int,int>>& sp);
631 
632  pair<size_t, size_t> extractAllUnitigs();
633 
634  template<bool is_void>
635  typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1);
636 
637  template<bool is_void>
638  typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1);
639 
640  void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins, const size_t nb_threads) const;
641  bool checkJoin(const Kmer& a, const const_UnitigMap<U, G>& cm_a, Kmer& b) const;
642  void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
643  size_t removeUnitigs(bool rmIsolated, bool clipTips, vector<Kmer>& v);
644 
645  size_t joinTips(string filename_MBBF_uniq_kmers, const size_t nb_threads = 1, const bool verbose = false);
646  vector<Kmer> extractMercyKmers(BlockedBloomFilter& bf_uniq_km, const size_t nb_threads = 1, const bool verbose = false);
647 
648  void writeGFA(const string& graphfilename, const size_t nb_threads = 1) const;
649  void writeFASTA(const string& graphfilename) const;
650 
651  void readGFA(const string& graphfilename);
652  void readFASTA(const string& graphfilename);
653 
654  template<bool is_void>
655  typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap) const;
656  template<bool is_void>
657  typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap) const;
658 
659  void mapRead(const const_UnitigMap<U, G>& um);
660  void mapRead(const const_UnitigMap<U, G>& um, LockGraph& lck_g);
661  void unmapRead(const const_UnitigMap<U, G>& um);
662 
663  void setKmerGmerLength(const int kmer_length, const int minimizer_length);
664  void print() const;
665 
666  vector<Minimizer> test(const Minimizer minz) const;
667 
668  int k_;
669  int g_;
670 
671  bool invalid;
672 
673  static const int tiny_vector_sz = 2;
674  static const int min_abundance_lim = 15;
675  static const int max_abundance_lim = 15;
676 
677  typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
678  typedef MinimizerHashTable_2Val hmap_min_unitigs_t;
679 
680  typedef typename hmap_min_unitigs_t::iterator hmap_min_unitigs_iterator;
681  typedef typename hmap_min_unitigs_t::const_iterator hmap_min_unitigs_const_iterator;
682 
683  vector<Unitig<U>*> v_unitigs;
684  vector<pair<Kmer, CompressedCoverage_t<U>>> v_kmers;
685 
686  hmap_min_unitigs_t hmap_min_unitigs;
687 
688  h_kmers_ccov_t h_kmers_ccov;
689 
690  BlockedBloomFilter bf;
691 
692  wrapperData<G> data;
693 };
694 
695 #include "CompactedDBG.tcc"
696 
697 #endif
bool operator!=(const neighborIterator &o) const
Inequality operator: check whether two neighborIterator objects are different.
bool deleteIsolated
Remove short isolated unitigs (length < 2k) from the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:152
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B into the data of a sub-unitig A.
Definition: CompactedDBG.hpp:240
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:312
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U, G>::filter.
Definition: CompactedDBG.hpp:136
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
size_t k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:146
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:311
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:157
bool outputGFA
Boolean indicating if the graph is written to a GFA file (true) or if the unitigs are written to a FA...
Definition: CompactedDBG.hpp:155
size_t nb_bits_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least once in the FASTA/FASTQ/GFA files of CDBG_Bu...
Definition: CompactedDBG.hpp:132
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:536
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:546
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:551
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:148
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:138
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for: ...
Definition: CompactedDBG.hpp:124
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:297
UnitigMap type interface.
The unitigIterator type interface.
size_t nb_bits_non_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least twice in the FASTA/FASTQ/GFA files of CDBG_B...
Definition: CompactedDBG.hpp:133
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:126
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:256
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:40
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:151
bool operator==(const neighborIterator &o) const
Equality operator: check whether two neighborIterator objects are the same.
Contain all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: NeighborIterator.hpp:12
The Unitig interface.
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:159
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:204
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:213
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:541
Interface for the class Kmer:
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files, error occurring during a method, etc.).
Definition: CompactedDBG.hpp:531
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:139
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:149
size_t read_chunksize
Number of reads a thread can read and process at a time.
Definition: CompactedDBG.hpp:128
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:127
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:228
bool useMercyKmers
Keep low-coverage k-mers (cov=1) that connect tips of the graph.
Definition: CompactedDBG.hpp:153
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U, G>::filter.
Definition: CompactedDBG.hpp:135
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:266