BamTools  2.4.0
BamAlignment.h
Go to the documentation of this file.
1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
9 
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
12 
13 #include "api/api_global.h"
14 #include "api/BamAux.h"
15 #include "api/BamConstants.h"
16 #include <cstdlib>
17 #include <cstring>
18 #include <string>
19 #include <vector>
20 
21 namespace BamTools {
22 
24 // forward declaration of BamAlignment's "friends"
25 namespace Internal {
26  class BamReaderPrivate;
27  class BamWriterPrivate;
28 } // namespace Internal
30 
31 // BamAlignment data structure
33 
34  // constructors & destructor
35  public:
36  BamAlignment(void);
37  BamAlignment(const BamAlignment& other);
38  ~BamAlignment(void);
39 
40  // queries against alignment flags
41  public:
42  bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
43  bool IsFailedQC(void) const; // returns true if this read failed quality control
44  bool IsFirstMate(void) const; // returns true if alignment is first mate on read
45  bool IsMapped(void) const; // returns true if alignment is mapped
46  bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
47  bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
48  bool IsPaired(void) const; // returns true if alignment part of paired-end read
49  bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
50  bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
51  bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
52  bool IsSecondMate(void) const; // returns true if alignment is second mate on read
53 
54  // manipulate alignment flags
55  public:
56  void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
57  void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
58  void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
59  void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
60  void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
61  void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
62  void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
63  void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
64  void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
65  void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
66  void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
67 
68  // tag data access methods
69  public:
70 
71  // add a new tag
72  template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
73  template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
74 
75  // edit (or append) tag
76  template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
77  template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
78 
79  // retrieves tag data
80  template<typename T> bool GetTag(const std::string& tag, T& destination) const;
81  template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
82 
83  // retrieves all current tag names
84  std::vector<std::string> GetTagNames(void) const;
85 
86  // retrieves the SAM/BAM type-code for requested tag name
87  bool GetTagType(const std::string& tag, char& type) const;
88 
89  // retrieves the SAM/BAM type-code for the data elements in an array tag
90  bool GetArrayTagType(const std::string& tag, char& type) const;
91 
92  // returns true if alignment has a record for this tag name
93  bool HasTag(const std::string& tag) const;
94 
95  // removes a tag
96  void RemoveTag(const std::string& tag);
97 
98  // additional methods
99  public:
100  // populates alignment string fields
101  bool BuildCharData(void);
102 
103  // calculates alignment end position
104  int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
105 
106  // returns a description of the last error that occurred
107  std::string GetErrorString(void) const;
108 
109  // retrieves the size, read locations and reference locations of soft-clip operations
110  bool GetSoftClips(std::vector<int>& clipSizes,
111  std::vector<int>& readPositions,
112  std::vector<int>& genomePositions,
113  bool usePadded = false) const;
114 
115  // public data fields
116  public:
117  std::string Name; // read name
118  int32_t Length; // length of query sequence
119  std::string QueryBases; // 'original' sequence (contained in BAM file)
120  std::string AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
121  std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
122  std::string TagData; // tag data (use provided methods to query/modify)
123  int32_t RefID; // ID number for reference sequence
124  int32_t Position; // position (0-based) where alignment starts
125  uint16_t Bin; // BAM (standard) index bin number for this alignment
126  uint16_t MapQuality; // mapping quality score
127  uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
128  std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
129  int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
130  int32_t MatePosition; // position (0-based) where alignment's mate starts
131  int32_t InsertSize; // mate-pair insert size
132  std::string Filename; // name of BAM file which this alignment comes from
133 
135  // internal utility methods
136  private:
137  bool FindTag(const std::string& tag,
138  char*& pTagData,
139  const unsigned int& tagDataLength,
140  unsigned int& numBytesParsed) const;
141  bool IsValidSize(const std::string& tag, const std::string& type) const;
142  void SetErrorString(const std::string& where, const std::string& what) const;
143  bool SkipToNextTag(const char storageType,
144  char*& pTagData,
145  unsigned int& numBytesParsed) const;
146 
147  // internal data
148  private:
149 
150  struct BamAlignmentSupportData {
151 
152  // data members
153  std::string AllCharData;
154  uint32_t BlockLength;
155  uint32_t NumCigarOperations;
156  uint32_t QueryNameLength;
157  uint32_t QuerySequenceLength;
158  bool HasCoreOnly;
159 
160  // constructor
161  BamAlignmentSupportData(void)
162  : BlockLength(0)
163  , NumCigarOperations(0)
164  , QueryNameLength(0)
165  , QuerySequenceLength(0)
166  , HasCoreOnly(false)
167  { }
168  };
169  BamAlignmentSupportData SupportData;
170  friend class Internal::BamReaderPrivate;
171  friend class Internal::BamWriterPrivate;
172 
173  mutable std::string ErrorString; // mutable to allow updates even in logically const methods
175 };
176 
177 // ---------------------------------------------------------
178 // BamAlignment tag access methods
179 
191 template<typename T>
192 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
193 
194  // if char data not populated, do that first
195  if ( SupportData.HasCoreOnly )
196  BuildCharData();
197 
198  // check tag/type size
199  if ( !IsValidSize(tag, type) ) {
200  // TODO: set error string?
201  return false;
202  }
203 
204  // check that storage type code is OK for T
205  if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
206  // TODO: set error string?
207  return false;
208  }
209 
210  // localize the tag data
211  char* pTagData = (char*)TagData.data();
212  const unsigned int tagDataLength = TagData.size();
213  unsigned int numBytesParsed = 0;
214 
215  // if tag already exists, return false
216  // use EditTag explicitly instead
217  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
218  // TODO: set error string?
219  return false;
220  }
221 
222  // otherwise, convert value to string
223  union { T value; char valueBuffer[sizeof(T)]; } un;
224  un.value = value;
225 
226  // copy original tag data to temp buffer
227  const std::string newTag = tag + type;
228  const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
229  RaiiBuffer originalTagData(newTagDataLength);
230  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
231 
232  // append newTag
233  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
234  memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
235 
236  // store temp buffer back in TagData
237  const char* newTagData = (const char*)originalTagData.Buffer;
238  TagData.assign(newTagData, newTagDataLength);
239  return true;
240 }
241 
242 template<>
243 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
244  const std::string& type,
245  const std::string& value)
246 {
247  // if char data not populated, do that first
248  if ( SupportData.HasCoreOnly )
249  BuildCharData();
250 
251  // check tag/type size
252  if ( !IsValidSize(tag, type) ) {
253  // TODO: set error string?
254  return false;
255  }
256 
257  // check that storage type code is OK for string
258  if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
259  // TODO: set error string?
260  return false;
261  }
262 
263  // localize the tag data
264  char* pTagData = (char*)TagData.data();
265  const unsigned int tagDataLength = TagData.size();
266  unsigned int numBytesParsed = 0;
267 
268  // if tag already exists, return false
269  // use EditTag explicitly instead
270  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
271  // TODO: set error string?
272  return false;
273  }
274 
275  // otherwise, copy tag data to temp buffer
276  const std::string newTag = tag + type + value;
277  const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
278  RaiiBuffer originalTagData(newTagDataLength);
279  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
280 
281  // append newTag (removes original null-term, then appends newTag + null-term)
282  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
283 
284  // store temp buffer back in TagData
285  const char* newTagData = (const char*)originalTagData.Buffer;
286  TagData.assign(newTagData, newTagDataLength);
287  return true;
288 }
289 
300 template<typename T>
301 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
302 
303  // if char data not populated, do that first
304  if ( SupportData.HasCoreOnly )
305  BuildCharData();
306 
307  // check for valid tag name length
308  if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
309  return false;
310 
311  // localize the tag data
312  char* pTagData = (char*)TagData.data();
313  const unsigned int tagDataLength = TagData.size();
314  unsigned int numBytesParsed = 0;
315 
316  // if tag already exists, return false
317  // use EditTag explicitly instead
318  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
319  // TODO: set error string?
320  return false;
321  }
322 
323  // build new tag's base information
324  char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
325  memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
326  newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
327  newTagBase[3] = TagTypeHelper<T>::TypeCode();
328 
329  // add number of array elements to newTagBase
330  const int32_t numElements = values.size();
331  memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
332 
333  // copy current TagData string to temp buffer, leaving room for new tag's contents
334  const size_t newTagDataLength = tagDataLength +
336  numElements*sizeof(T);
337  RaiiBuffer originalTagData(newTagDataLength);
338  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
339 
340  // write newTagBase (removes old null term)
341  strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
342 
343  // add vector elements to tag
344  int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
345  for ( int i = 0 ; i < numElements; ++i ) {
346  const T& value = values.at(i);
347  memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
348  }
349 
350  // store temp buffer back in TagData
351  const char* newTagData = (const char*)originalTagData.Buffer;
352  TagData.assign(newTagData, newTagDataLength);
353  return true;
354 }
355 
370 template<typename T>
371 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
372 
373  // if char data not populated, do that first
374  if ( SupportData.HasCoreOnly )
375  BuildCharData();
376 
377  // remove existing tag if present, then append tag with new value
378  if ( HasTag(tag) )
379  RemoveTag(tag);
380  return AddTag(tag, type, value);
381 }
382 
394 template<typename T>
395 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
396 
397  // if char data not populated, do that first
398  if ( SupportData.HasCoreOnly )
399  BuildCharData();
400 
401  // remove existing tag if present, then append tag with new values
402  if ( HasTag(tag) )
403  RemoveTag(tag);
404  return AddTag(tag, values);
405 }
406 
407 
415 template<typename T>
416 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
417 
418  // skip if alignment is core-only
419  if ( SupportData.HasCoreOnly ) {
420  // TODO: set error string?
421  return false;
422  }
423 
424  // skip if no tags present
425  if ( TagData.empty() ) {
426  // TODO: set error string?
427  return false;
428  }
429 
430  // localize the tag data
431  char* pTagData = (char*)TagData.data();
432  const unsigned int tagDataLength = TagData.size();
433  unsigned int numBytesParsed = 0;
434 
435  // return failure if tag not found
436  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
437  // TODO: set error string?
438  return false;
439  }
440 
441  // fetch data type
442  const char type = *(pTagData - 1);
443  if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
444  // TODO: set error string ?
445  return false;
446  }
447 
448  // determine data length
449  int destinationLength = 0;
450  switch ( type ) {
451 
452  // 1 byte data
456  destinationLength = 1;
457  break;
458 
459  // 2 byte data
462  destinationLength = 2;
463  break;
464 
465  // 4 byte data
469  destinationLength = 4;
470  break;
471 
472  // var-length types not supported for numeric destination
476  SetErrorString("BamAlignment::GetTag",
477  "cannot store variable length tag data into a numeric destination");
478  return false;
479 
480  // unrecognized tag type
481  default:
482  const std::string message = std::string("invalid tag type: ") + type;
483  SetErrorString("BamAlignment::GetTag", message);
484  return false;
485  }
486 
487  // store data in destination
488  destination = 0;
489  memcpy(&destination, pTagData, destinationLength);
490 
491  // return success
492  return true;
493 }
494 
495 template<>
496 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
497  std::string& destination) const
498 {
499  // skip if alignment is core-only
500  if ( SupportData.HasCoreOnly ) {
501  // TODO: set error string?
502  return false;
503  }
504 
505  // skip if no tags present
506  if ( TagData.empty() ) {
507  // TODO: set error string?
508  return false;
509  }
510 
511  // localize the tag data
512  char* pTagData = (char*)TagData.data();
513  const unsigned int tagDataLength = TagData.size();
514  unsigned int numBytesParsed = 0;
515 
516  // return failure if tag not found
517  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
518  // TODO: set error string?
519  return false;
520  }
521 
522  // otherwise copy data into destination
523  const unsigned int dataLength = strlen(pTagData);
524  destination.clear();
525  destination.resize(dataLength);
526  memcpy( (char*)destination.data(), pTagData, dataLength );
527 
528  // return success
529  return true;
530 }
531 
539 template<typename T>
540 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
541 
542  // skip if alignment is core-only
543  if ( SupportData.HasCoreOnly ) {
544  // TODO: set error string?
545  return false;
546  }
547 
548  // skip if no tags present
549  if ( TagData.empty() ) {
550  // TODO: set error string?
551  return false;
552  }
553 
554  // localize the tag data
555  char* pTagData = (char*)TagData.data();
556  const unsigned int tagDataLength = TagData.size();
557  unsigned int numBytesParsed = 0;
558 
559  // return false if tag not found
560  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
561  // TODO: set error string?
562  return false;
563  }
564 
565  // check that tag is array type
566  const char tagType = *(pTagData - 1);
567  if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
568  SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
569  return false;
570  }
571 
572  // fetch element type
573  const char elementType = *pTagData;
574  if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
575  // TODO: set error string ?
576  return false;
577  }
578  ++pTagData;
579 
580  // calculate length of each element in tag's array
581  int elementLength = 0;
582  switch ( elementType ) {
586  elementLength = sizeof(uint8_t);
587  break;
588 
591  elementLength = sizeof(uint16_t);
592  break;
593 
597  elementLength = sizeof(uint32_t);
598  break;
599 
600  // var-length types not supported for numeric destination
604  SetErrorString("BamAlignment::GetTag",
605  "invalid array data, variable-length elements are not allowed");
606  return false;
607 
608  // unknown tag type
609  default:
610  const std::string message = std::string("invalid array element type: ") + elementType;
611  SetErrorString("BamAlignment::GetTag", message);
612  return false;
613  }
614 
615  // get number of elements
616  int32_t numElements;
617  memcpy(&numElements, pTagData, sizeof(int32_t));
618  pTagData += 4;
619  destination.clear();
620  destination.reserve(numElements);
621 
622  // read in elements
623  T value;
624  for ( int i = 0 ; i < numElements; ++i ) {
625  memcpy(&value, pTagData, sizeof(T));
626  pTagData += sizeof(T);
627  destination.push_back(value);
628  }
629 
630  // return success
631  return true;
632 }
633 
634 typedef std::vector<BamAlignment> BamAlignmentVector;
635 
636 } // namespace BamTools
637 
638 #endif // BAMALIGNMENT_H
const char BAM_TAG_TYPE_FLOAT
Definition: BamConstants.h:80
const char BAM_TAG_TYPE_UINT16
Definition: BamConstants.h:77
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition: BamConstants.h:87
int32_t InsertSize
mate-pair insert size
Definition: BamAlignment.h:131
std::string TagData
tag data (use the provided methods to query/modify)
Definition: BamAlignment.h:122
const char BAM_TAG_TYPE_INT8
Definition: BamConstants.h:74
The main BAM alignment data structure.
Definition: BamAlignment.h:32
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition: BamAlignment.h:127
const char BAM_TAG_TYPE_INT32
Definition: BamConstants.h:78
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition: BamAlignment.h:125
bool GetTag(const std::string &tag, T &destination) const
Definition: BamAlignment.h:416
std::string Filename
name of BAM file which this alignment comes from
Definition: BamAlignment.h:132
const char BAM_TAG_TYPE_ARRAY
Definition: BamConstants.h:83
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition: BamAlignment.h:128
#define API_EXPORT
Definition: api_global.h:18
const char BAM_TAG_TYPE_HEX
Definition: BamConstants.h:82
const char BAM_TAG_TYPE_ASCII
Definition: BamConstants.h:73
std::string QueryBases
&#39;original&#39; sequence (as reported from sequencing machine)
Definition: BamAlignment.h:119
int32_t Position
position (0-based) where alignment starts
Definition: BamAlignment.h:124
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:192
int32_t MateRefID
ID number for reference sequence where alignment&#39;s mate was aligned.
Definition: BamAlignment.h:129
std::string Name
read name
Definition: BamAlignment.h:117
std::string AlignedBases
&#39;aligned&#39; sequence (includes any indels, padding, clipping)
Definition: BamAlignment.h:120
const char BAM_TAG_TYPE_INT16
Definition: BamConstants.h:76
const char BAM_TAG_TYPE_STRING
Definition: BamConstants.h:81
uint16_t MapQuality
mapping quality score
Definition: BamAlignment.h:126
Contains all BamTools classes & methods.
Definition: Sort.h:24
const uint8_t BAM_TAG_TAGSIZE
Definition: BamConstants.h:85
int32_t MatePosition
position (0-based) where alignment&#39;s mate starts
Definition: BamAlignment.h:130
std::vector< BamAlignment > BamAlignmentVector
Definition: BamAlignment.h:634
int32_t Length
length of query sequence
Definition: BamAlignment.h:118
const char BAM_TAG_TYPE_UINT8
Definition: BamConstants.h:75
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition: BamAlignment.h:121
const char BAM_TAG_TYPE_UINT32
Definition: BamConstants.h:79
int32_t RefID
ID number for reference sequence.
Definition: BamAlignment.h:123
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:371