From 07bbab5821892814a6f4c94a943bd7957ee74144 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Sat, 1 Sep 2018 14:22:17 -0700 Subject: [PATCH 01/16] Use CIGAR field and MD aux field to reconstruct alignments from BAM records --- src/bam/md_align.rs | 1254 ++++++++++++++++++++++++++++++++++++++++ src/bam/mod.rs | 1 + test/test_md_align.bam | Bin 0 -> 2221 bytes 3 files changed, 1255 insertions(+) create mode 100644 src/bam/md_align.rs create mode 100644 test/test_md_align.bam diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs new file mode 100644 index 000000000..28dc6e5dc --- /dev/null +++ b/src/bam/md_align.rs @@ -0,0 +1,1254 @@ +use std::fmt::Write; +use std::str; + +use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; + +use bam; +use bam::record::{Cigar,CigarString}; + +/// Position in a BAM record alignment, based on CIGAR and MD +/// information. +pub trait MDAlignPos { + /// Read nucleotide sequence as reported in the BAM + /// alignment. _Note_ that this is the complement of the read + /// sequence itself for reverse-strand alignments. + /// + /// `None` is returned for read deletions. + fn read_nt(&self) -> Option; + + /// Reference nucleotide sequence as reported in the BAM alignment. + /// + /// `None` is returned for read insertions and + /// soft clipped positions. + fn ref_nt(&self) -> Option; + + /// Read nucleotide quality score as reported in the BAM + /// alignment. + /// + /// `None` is returned at read deletion positions. + fn read_qual(&self) -> Option; + + /// Zero-based offset in the read sequence as reported in the BAM + /// alignment. _Note_ that these positions will run from the last + /// to the first base in the original input sequence for + /// reverse-strand alignments. + /// + /// `None is returned for read deletions. + fn read_seq_pos(&self) -> Option; + + /// Zero-based offset in the read sequence as reported in the BAM + /// alignment. For read deletions, the offset of the next aligned + /// read sequence nucleotide is reported. _Note_ that these + /// positions will run from the last to the first base in the + /// original input sequence for reverse-strand alignments. + fn read_seq_pos_or_next(&self) -> u32; + + /// Zero-based offset in the reference sequence. + /// + /// `None` is returned for read insertions and + /// soft clipped positions. + fn ref_pos(&self) -> Option; + + /// Zero-based offset in the reference sequence. For read + /// insertions, the offset of the next aligned reference sequence + /// nucleotide is reported. + /// + /// `None` is returned for soft clipped positions. + fn ref_pos_or_next(&self) -> Option; + + /// Zero-based offset for this position in the original input read + /// sequence. Position 0 is the first nucleotide from the input + /// read sequence for either forward or reverse strand alignments. + /// + /// `None` is returned for read deletions. + fn read_true_pos(&self) -> Option; + + /// Nucleotide in the original input read sequence, taking into + /// account the fact that reverse-strand alignments report the + /// reverse-complement of the input read sequence. + /// + /// `None` is returned for read deletions. + fn read_true_nt(&self) -> Option; + + /// Nucleotide for the reference sequence that is matched against + /// `read_true_nt()`. _Note_ that this is the complement of the + /// nucleotide in the reference sequence (as per `ref_nt`) for + /// reverse-strand alignments. + /// + /// `None` is returned for read insertions and soft-clipped + /// positions. + fn ref_true_nt(&self) -> Option; + + /// Character for read sequence line in pretty-printed alignment + /// format. This character is the read sequence base, in + /// lower-case for soft-clipped positions, or a `-` for read + /// deletions. + fn read_line_char(&self) -> char { + if self.ref_pos_or_next().is_none() { + self.read_nt().map_or('-', |nt| nt.to_ascii_lowercase() as char) + } else { + self.read_nt().map_or('-', |nt| nt as char) + } + } + + /// Character for middle match line in pretty-printed alignment + /// format. This is a vertical bar at match positions and a space + /// otherwise. + fn match_line_char(&self) -> char { + let read = self.read_nt(); + if read.is_some() && read == self.ref_nt() { + '|' + } else { + ' ' + } + } + + /// Character for reference sequence line in pretty-printed + /// alignment format. This character is the reference sequence + /// base when present, a `-` for read insertions, and a space for + /// soft-clipped positions. + fn ref_line_char(&self) -> char { + self.ref_nt().map_or_else(|| + if self.ref_pos_or_next().is_none() { + ' ' + } else { + '-' + }, + |nt| nt as char) + } +} + +/// Self-contained data type containing all information about an +/// alignment position in a BAM record. +#[derive(Debug,Clone,PartialEq,Eq)] +pub struct AlignPos { + ref_nt: Option, + read_nt_qual: Option<(u8, u8)>, + is_reverse: bool, + read_seq_pos_or_next: u32, + ref_pos_or_next: Option, + read_len: u32, +} + +impl MDAlignPos for AlignPos { + fn read_nt(&self) -> Option + { + self.read_nt_qual.as_ref().map(|(nt, _q)| *nt) + } + + fn ref_nt(&self) -> Option { self.ref_nt } + + fn read_qual(&self) -> Option + { + self.read_nt_qual.as_ref().map(|(_nt, q)| *q) + } + + fn read_seq_pos(&self) -> Option + { + if self.read_nt_qual.is_some() { + Some(self.read_seq_pos_or_next) + } else { + None + } + } + + fn read_seq_pos_or_next(&self) -> u32 { self.read_seq_pos_or_next } + + fn ref_pos(&self) -> Option + { + if self.ref_nt.is_some() { + self.ref_pos_or_next + } else { + None + } + } + + fn ref_pos_or_next(&self) -> Option { self.ref_pos_or_next } + + fn read_true_pos(&self) -> Option + { + self.read_seq_pos() + .map(|seq_pos| + if self.is_reverse { + self.read_len - (1 + seq_pos) + } else { + seq_pos + }) + } + + fn read_true_nt(&self) -> Option + { + self.read_nt() + .map(|seq_nt| + if self.is_reverse { + fast_compl(seq_nt) + } else { + seq_nt + }) + } + + fn ref_true_nt(&self) -> Option + { + self.ref_nt() + .map(|seq_nt| + if self.is_reverse { + fast_compl(seq_nt) + } else { + seq_nt + }) + } +} + +/// Iterator that generates `AlignPos` alignment positions for a BAM +/// record. +/// +/// Note that these positions are generated in reference sequence +/// order. For a reverse-strand alignment, they run from the last to +/// the first sequenced base. +pub struct AlignPosIter { + align_iter: MDPosIter, + read_seq: Vec, + read_qual: Vec, + is_reverse: bool, +} + +impl AlignPosIter { + /// Create a new iterator for a BAM record. + /// + /// # Arguments + /// + /// * `record` is the BAM record whose alignment will be extracted + pub fn new_from_record(record: &bam::record::Record) -> Result + { + let align_iter = MDPosIter::new_from_record(record)?; + Ok( AlignPosIter { align_iter: align_iter, + read_seq: record.seq().as_bytes(), + read_qual: record.qual().to_vec(), + is_reverse: record.is_reverse(), } ) + } + + fn pos_to_align_pos(&self, pos: MDPos) -> Result + { + let read_seq_pos = pos.read_seq_pos(); + let ref_pos = pos.ref_pos(); + let read_nt_qual = match read_seq_pos { + Some(p) => Some( (*self.read_seq.get(p as usize).ok_or_else(|| MDAlignError::BadSeqLen)?, + *self.read_qual.get(p as usize).ok_or_else(|| MDAlignError::BadSeqLen)?) ), + None => None, + }; + // When we don't have a ref_nt but we do have a ref_pos, this should be a match + // and there should be a defined read_nt + let ref_nt = pos.ref_nt().or_else(|| + if ref_pos.is_some() { + Some( read_nt_qual.unwrap().0 ) + } else { + None + }); + Ok( AlignPos { ref_nt: ref_nt, + read_nt_qual: read_nt_qual, + is_reverse: self.is_reverse, + read_seq_pos_or_next: pos.read_seq_pos_or_next(), + ref_pos_or_next: pos.ref_pos_or_next(), + read_len: self.read_seq.len() as u32 } ) + } + + /// Collects all alignment positions into a _read strand_ vector + /// of alignment positions. This is right to left on the reference + /// sequence, for reverse strand alignments. + #[allow(unused_must_use)] + pub fn read_strand_alignment(&mut self) -> Result,MDAlignError> + { + let mut aln: Result, MDAlignError> = self.collect(); + + if self.is_reverse { + aln.as_mut().map(|a| a.reverse()); + } + + aln + } +} + +impl Iterator for AlignPosIter { + type Item = Result; + + fn next(&mut self) -> Option + { + self.align_iter.next().map(|res| + { + let pos = res?; + self.pos_to_align_pos(pos) + }) + } +} + +/// Alignment position on a BAM record. Read sequence and quality +/// information can be extracted directly because a reference to the +/// original record is retained. +pub struct RecAlignPos<'a> { + record: &'a bam::record::Record, + pos: MDPos, +} + +impl <'a> RecAlignPos<'a> { + fn new(record: &'a bam::record::Record, pos: MDPos) -> Self + { + RecAlignPos { record: record, pos: pos } + } + + fn read_nt_at(&self, at: u32) -> u8 { + self.record.seq()[at as usize] + } + + fn qual_at(&self, at: u32) -> u8 { + self.record.qual()[at as usize] + } +} + +impl <'a> MDAlignPos for RecAlignPos<'a> { + fn read_nt(&self) -> Option { + self.pos.read_seq_pos() + .map(|pos| self.read_nt_at(pos)) + } + + fn ref_nt(&self) -> Option { + self.pos.ref_nt().or_else(|| + if self.ref_pos().is_some() { + self.read_nt() + } else { + None + }) + } + + fn read_qual(&self) -> Option { + self.pos.read_seq_pos() + .map(|pos| self.qual_at(pos)) + } + + fn read_seq_pos(&self) -> Option { self.pos.read_seq_pos() } + + fn read_seq_pos_or_next(&self) -> u32 { self.pos.read_seq_pos_or_next() } + + fn ref_pos(&self) -> Option { self.pos.ref_pos() } + + fn ref_pos_or_next(&self) -> Option { self.pos.ref_pos_or_next() } + + fn read_true_pos(&self) -> Option { + self.read_seq_pos() + .map(|seq_pos| + if self.record.is_reverse() { + self.record.qual().len() as u32 - (1 + seq_pos) + } else { + seq_pos + }) + } + + fn read_true_nt(&self) -> Option { + self.read_nt().map(|nt| + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + }) + } + + fn ref_true_nt(&self) -> Option { + self.ref_nt().map(|nt| + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + }) + } +} + +/// Iterator that generates `RecAlignPos` alignment positions for a BAM +/// record. +/// +/// Note that these positions are generated in reference sequence +/// order. For a reverse-strand alignment, they run from the last to +/// the first sequenced base. +pub struct RecAlignPosIter<'a> { + align_iter: MDPosIter, + record: &'a bam::record::Record, +} + +impl <'a> RecAlignPosIter<'a> { + /// Create a new iterator for a BAM record. + /// + /// # Arguments + /// + /// * `record` is the BAM record whose alignment will be extracted + pub fn new(record: &'a bam::record::Record) -> Result { + let align_iter = MDPosIter::new_from_record(record)?; + Ok( RecAlignPosIter{ align_iter: align_iter, record: record } ) + } + + /// Collects all alignment positions into a _read strand_ vector + /// of alignment positions. This is right to left on the reference + /// sequence, for reverse strand alignments. + #[allow(unused_must_use)] + pub fn read_strand_alignment(&mut self) -> Result>,MDAlignError> + { + let mut aln: Result>, MDAlignError> = self.collect(); + + if self.record.is_reverse() { + aln.as_mut().map(|a| a.reverse()); + } + + aln + } +} + +impl <'a> Iterator for RecAlignPosIter<'a> { + type Item = Result, MDAlignError>; + + fn next(&mut self) -> Option + { + self.align_iter.next() + .map(|res| res.map(|mdpos| RecAlignPos::new(self.record, mdpos))) + } +} + +/// Alignment position based on information in the CIGAR and MD aux +/// field for a BAM record. The `MDPos` entries for a BAM record +/// represent all information from these fields -- together with the +/// sequence itself, these can fully reconstruct the read-vs-reference +/// alignment. +#[derive(Debug,Clone,PartialEq,Eq)] +pub enum MDPos { + Match{read_seq_pos: u32, ref_pos: u32}, + Mismatch{ref_nt: u8, read_seq_pos: u32, ref_pos: u32}, + Insert{read_seq_pos: u32, ref_pos_next: u32}, + Delete{ref_nt: u8, read_seq_pos_next: u32, ref_pos: u32}, + SoftClip{read_seq_pos: u32}, +} + +#[allow(unused_variables)] +impl MDPos { + /// Zero-based offset in the read sequence as reported in the BAM + /// alignment. _Note_ that these positions will run from the last + /// to the first base in the original input sequence for + /// reverse-strand alignments. + /// + /// `None is returned for read deletions. + pub fn read_seq_pos(&self) -> Option { + match self { + MDPos::Match{ read_seq_pos, ref_pos } => Some(*read_seq_pos), + MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*read_seq_pos), + MDPos::Insert{ read_seq_pos, ref_pos_next } => Some(*read_seq_pos), + MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => None, + MDPos::SoftClip{ read_seq_pos } => Some(*read_seq_pos), + } + } + + /// Zero-based offset in the read sequence as reported in the BAM + /// alignment. For read deletions, the offset of the next aligned + /// read sequence nucleotide is reported. _Note_ that these + /// positions will run from the last to the first base in the + /// original input sequence for reverse-strand alignments. + pub fn read_seq_pos_or_next(&self) -> u32 { + match self { + MDPos::Match{ read_seq_pos, ref_pos } => *read_seq_pos, + MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => *read_seq_pos, + MDPos::Insert{ read_seq_pos, ref_pos_next } => *read_seq_pos, + MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => *read_seq_pos_next, + MDPos::SoftClip{ read_seq_pos } => *read_seq_pos, + } + } + + /// Zero-based offset in the reference sequence. + /// + /// `None` is returned for read insertions and + /// soft clipped positions. + pub fn ref_pos(&self) -> Option { + match self { + MDPos::Match{ read_seq_pos, ref_pos } => Some(*ref_pos), + MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_pos), + MDPos::Insert{ read_seq_pos, ref_pos_next } => None, + MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_pos), + MDPos::SoftClip{ read_seq_pos } => None, + } + } + + /// Zero-based offset in the reference sequence. For read + /// insertions, the offset of the next aligned reference sequence + /// nucleotide is reported. + /// + /// `None` is returned for soft clipped positions. + pub fn ref_pos_or_next(&self) -> Option { + match self { + MDPos::Match{ read_seq_pos, ref_pos } => Some(*ref_pos), + MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_pos), + MDPos::Insert{ read_seq_pos, ref_pos_next } => Some(*ref_pos_next), + MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_pos), + MDPos::SoftClip{ read_seq_pos } => None, + } + } + + /// Reference nucleotide, _when it differs from the read nucleotide_. + /// + /// `None` is returned for matches, read insertions and soft + /// clipped positions. + pub fn ref_nt(&self) -> Option { + match self { + MDPos::Match{ read_seq_pos, ref_pos } => None, + MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_nt), + MDPos::Insert{ read_seq_pos, ref_pos_next } => None, + MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_nt), + MDPos::SoftClip{ read_seq_pos } => None, + } + } +} + +/// Iterator over the `MDPos` positions represented by a BAM +/// record. +/// +/// Note that these positions are generated in reference sequence +/// order. For a reverse-strand alignment, they run from the last to +/// the first sequenced base. +#[derive(Debug)] +pub struct MDPosIter { + md_stack: Vec, + cigar_stack: Vec, + ref_pos_curr: u32, + read_pos_curr: u32, +} + +impl MDPosIter { + /// Create a new iterator for a BAM record. + /// + /// # Arguments + /// + /// * `record` is the BAM record whose alignment will be extracted + pub fn new_from_record(record: &bam::record::Record) -> Result { + let mut md_stack = MatchDesc::parse_from_record(record)?; + md_stack.reverse(); + + let CigarString(mut cigar_stack) = (*record.cigar()).clone(); + cigar_stack.reverse(); + + Ok( MDPosIter { md_stack: md_stack, + cigar_stack: cigar_stack, + ref_pos_curr: record.pos() as u32, + read_pos_curr: 0 } ) + } + + // Utility function that yields the next MDPos. + // Requires the cigar stack is non-empty + // Requires that 0-length matches and non-yielding cigar entries + // are cleared from the tops of those respective stacks. + fn next_with_some(&mut self) -> Result + { + let pop_cigar; + + let res = match self.cigar_stack.last_mut().unwrap() { + Cigar::Match(ref mut ciglen) => { + let pop_md; + let mmm = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + MatchDesc::Matches(ref mut mdlen) => { + let pos = MDPos::Match { read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, }; + self.read_pos_curr += 1; + self.ref_pos_curr += 1; + *mdlen -= 1; + pop_md = *mdlen == 0; + *ciglen -= 1; + pop_cigar = *ciglen == 0; + pos + }, + MatchDesc::Mismatch(ref_nt) => { + let pos = MDPos::Mismatch { ref_nt: *ref_nt, + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, }; + self.read_pos_curr += 1; + self.ref_pos_curr += 1; + pop_md = true; + *ciglen -= 1; + pop_cigar = *ciglen == 0; + pos + }, + _ => { + return Err( MDAlignError::MDvsCIGAR ); + } + }; + if pop_md { + self.md_stack.pop(); + } + mmm + }, + Cigar::Equal(ref mut ciglen) => { + let pop_md; + let m = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + MatchDesc::Matches(ref mut mdlen) => { + let pos = MDPos::Match { read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, }; + self.read_pos_curr += 1; + self.ref_pos_curr += 1; + + *mdlen -= 1; + pop_md = *mdlen == 0; + *ciglen -= 1; + pop_cigar = *ciglen == 0; + pos + }, + _ => return Err( MDAlignError::MDvsCIGAR ), + }; + if pop_md { + self.md_stack.pop(); + } + m + }, + Cigar::Diff(ref mut ciglen) => { + let pop_md; + let mm = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + MatchDesc::Mismatch(ref_nt) => { + let pos = MDPos::Mismatch { ref_nt: *ref_nt, + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, }; + self.read_pos_curr += 1; + self.ref_pos_curr += 1; + pop_md = true; + *ciglen -= 1; + pop_cigar = *ciglen == 0; + pos + }, + _ => return Err( MDAlignError::MDvsCIGAR ), + }; + if pop_md { + self.md_stack.pop(); + } + mm + } + Cigar::Ins(ref mut len) => { + let mut pos = MDPos::Insert { read_seq_pos: self.read_pos_curr, + ref_pos_next: self.ref_pos_curr }; + self.read_pos_curr += 1; + *len -= 1; + pop_cigar = *len == 0; + pos + }, + Cigar::Del(ref mut len) => { + let pop_md; + + let del = match self.md_stack.last_mut() { + Some(MatchDesc::Deletion(ref mut ref_nts)) => { + if ref_nts.len() > 0 { + let ref_nt = ref_nts.remove(0); + pop_md = ref_nts.is_empty(); + let pos = MDPos::Delete { ref_nt: ref_nt, + ref_pos: self.ref_pos_curr, + read_seq_pos_next: self.read_pos_curr }; + self.ref_pos_curr += 1; + pos + } else { + return Err( MDAlignError::MDvsCIGAR ); + } + }, + _ => return Err( MDAlignError::MDvsCIGAR ), + }; + *len -= 1; + pop_cigar = *len == 0; + if pop_md { + self.md_stack.pop(); + } + del + }, + Cigar::SoftClip(ref mut len) => { + let pos = MDPos::SoftClip { read_seq_pos: self.read_pos_curr }; + self.read_pos_curr += 1; + *len -= 1; + pop_cigar = *len == 0; + pos + }, + Cigar::RefSkip(_) => panic!("RefSkip in next_with_some"), + Cigar::HardClip(_) => panic!("HardClip in next_with_some"), + Cigar::Pad(_) => panic!("Pad in next_with_some"), + }; + + if pop_cigar { + self.cigar_stack.pop(); + } + + Ok( res ) + } + + /// Collect the alignment positions from this record into an + /// `Alignment`. This is always a semi-global alignment that + /// includes the soft clipping from the read as `Xclip` operations. + pub fn collect_into_alignment(self) -> Result { + let mds_result: Result,MDAlignError> = self.collect(); + let mds = mds_result?; + let mut iter = mds.as_slice().into_iter(); + + let mut ops = Vec::new(); + + let mut curr = iter.next(); + + let mut left_clip = 0; + while curr.ok_or_else(||MDAlignError::EmptyAlign)?.ref_pos_or_next().is_none() { + left_clip += 1; + curr = iter.next(); + } + ops.push(AlignmentOperation::Xclip(left_clip)); + + let (xstart, ystart) = if let Some(md) = curr { + (md.read_seq_pos_or_next(), md.ref_pos_or_next().unwrap()) + } else { + panic!("No MDPos remaining after left clip") + }; + + let mut xend = xstart; + let mut yend = ystart; + + while let Some(md) = curr { + match md { + MDPos::Match{read_seq_pos, ref_pos} => { + ops.push(AlignmentOperation::Match); + xend = *read_seq_pos; + yend = *ref_pos; + }, + MDPos::Mismatch{ref_nt: _, read_seq_pos, ref_pos} => { + ops.push(AlignmentOperation::Subst); + xend = *read_seq_pos; + yend = *ref_pos; + }, + MDPos::Insert{read_seq_pos, ref_pos_next: _} => { + ops.push(AlignmentOperation::Ins); + xend = *read_seq_pos; + }, + MDPos::Delete{ref_nt: _, read_seq_pos_next: _, ref_pos} => { + ops.push(AlignmentOperation::Del); + yend = *ref_pos; + }, + MDPos::SoftClip{read_seq_pos: _} => { + break + }, + } + curr = iter.next(); + } + + let mut right_clip = 0; + let mut xlen = xend; + while let Some(md) = curr { + match md { + MDPos::SoftClip{read_seq_pos} => { + xlen = *read_seq_pos; + }, + _ => { + return Err( MDAlignError::BadMD ); + }, + }; + right_clip += 1; + curr = iter.next(); + } + ops.push(AlignmentOperation::Xclip(right_clip)); + + Ok( Alignment { + score: 0, + xstart: xstart as usize, + ystart: ystart as usize, + xend: xend as usize, + yend: yend as usize, + ylen: (1 + yend - ystart) as usize, + xlen: xlen as usize, + operations: ops, + mode: AlignmentMode::Semiglobal + }) + } +} + +impl Iterator for MDPosIter { + type Item = Result; + + fn next(&mut self) -> Option + { + // Process non-yielding cigar entries + loop { + let handled = match self.cigar_stack.last() { + Some(Cigar::RefSkip(len)) => { + self.ref_pos_curr += *len; + true + }, + Some(Cigar::HardClip(_len)) => true, + Some(Cigar::Pad(_len)) => true, + _ => false, + }; + if handled { + self.cigar_stack.pop(); + } else { + break; + } + } + + // Consume 0-length match + while self.md_stack.last() == Some(&MatchDesc::Matches(0)) { + self.md_stack.pop(); + } + + if self.cigar_stack.is_empty() { + None + } else { + Some( self.next_with_some().map_err(|e| ::std::convert::From::from(e)) ) + } + } +} + +/// Abstract representation of entries in an MD aux field +#[derive(Debug,Clone,PartialEq,Eq)] +pub enum MatchDesc { + Matches(u32), + Mismatch(u8), + Deletion(Vec) +} + +impl MatchDesc { + /// Create a new, matching MD entry. + /// + /// # Arguments + /// + /// * `len` is the length of the match + pub fn new_matches(len: u32) -> Self { MatchDesc::Matches(len) } + + /// Create a new mismatched MD entry. + /// + /// # Arguments + /// + /// * `refnt` is the reference base at the mismatched position + /// + /// # Errors + /// + /// If `refnt` is not an upper-case ASCII character, an error + /// variant is returned. + pub fn new_mismatch(refnt: u8) -> Result + { + if refnt.is_ascii_uppercase() { + Ok( MatchDesc::Mismatch(refnt) ) + } else { + Err( MDAlignError::BadMD ) + } + } + + /// Create a new read-deletion MD entry + /// + /// # Arguments + /// + /// * `refnts` are the reference bases at the deletion position + /// + /// # Errors + /// + /// If `refnts` is empty or contains invalid (non upper-case + /// ASCII) characters, an error variant is returned. + pub fn new_deletion(refnts: &[u8]) -> Result + { + if refnts.len() > 0 && refnts.iter().all(|&c| c.is_ascii_uppercase()) { + Ok( MatchDesc::Deletion(refnts.to_vec()) ) + } else { + Err( MDAlignError::BadMD ) + } + } + + /// True if this `MatchDesc` represents a run of matches + pub fn is_matches(&self) -> bool + { + match self { + MatchDesc::Matches(_) => true, + _ => false, + } + } + + /// True if this `MatchDesc` represents a mismatch + pub fn is_mismatch(&self) -> bool + { + match self { + MatchDesc::Mismatch(_) => true, + _ => false, + } + } + + /// True if this `MatchDesc` represents a read deletion relative to the reference + pub fn is_deletion(&self) -> bool + { + match self { + MatchDesc::Deletion(_) => true, + _ => false, + } + } + + /// Create a `Vec` of `MatchDesc` entries corresponding to the MD aux field on a record. + /// + /// # Arguments + /// + /// * `record` is the source of the MD aux field to parse + /// + /// # Errors + /// + /// If `record` has no MD aux field, or if the value of the aux field is malformed, + /// an error variant will be returned. + pub fn parse_from_record(record: &bam::record::Record) -> Result,MDAlignError> { + Self::parse(Self::get_md(record)?) + } + + /// Parses a bytestring as an MD aux field description. + /// + /// # Arguments + /// + /// * `mdstring` is a bytestring MD aux field + /// + /// # Errors + /// + /// If the `mdstring` is not a well-formed MD aux field, an error variant is returned. + /// + /// # Examples + /// + /// ``` + /// use rust_htslib::bam::md_align::{MatchDesc,MDAlignError}; + /// # fn try_main() -> Result<(), MDAlignError> { + /// assert_eq!(MatchDesc::parse(b"10A5^AC6")?, + /// [MatchDesc::Matches(10), + /// MatchDesc::Mismatch(b'A'), + /// MatchDesc::Matches(5), + /// MatchDesc::Deletion(b"AC".to_vec()), + /// MatchDesc::Matches(6)]); + /// # Ok( () ) + /// # } + /// # fn main() { try_main().unwrap(); } + /// ``` + pub fn parse(mdstring: &[u8]) -> Result,MDAlignError> { + MatchDescIter::new(mdstring).collect() + } + + /// Convert a vector of `MatchDesc` entries into an MD aux field + /// string. The string will be partly normalized by merging + /// adjacent matches (but not deletions), which guarantees + /// unambiguous parsing. + /// + /// # Arguments + /// + /// * `mds` are a vector of `MatchDesc` entries + /// * `strict_match0` controls whether 0-length matches are added + /// whenever a mismatch or deletion is not followed immediately by a + /// run of matches. Strictly adding a 0-length match between a mismatch + /// and a subsequent mismatch, deletion, or end-of-string ensures + /// that the resulting MD field string matches the regexp + /// + /// `[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*` + /// + /// _Note_ that 0-length matches are always added when a deletion is + /// followed by a mismatch, which is required for unambiguous parsing. + /// + /// # Examples + /// + /// ``` + /// use rust_htslib::bam::md_align::{MatchDesc,MDAlignError}; + /// # fn try_main() -> Result<(), MDAlignError> { + /// let mdvec = vec![MatchDesc::Matches(10), + /// MatchDesc::Mismatch(b'A'), + /// MatchDesc::Matches(5), + /// MatchDesc::Deletion(b"AC".to_vec()), + /// MatchDesc::Matches(6)]; + /// assert_eq!(MatchDesc::unparse(&mdvec, false), "10A5^AC6"); + /// assert_eq!(MatchDesc::unparse(&mdvec, true), "10A5^AC6"); + /// let unnorm = vec![MatchDesc::Matches(10), + /// MatchDesc::Mismatch(b'A'), + /// MatchDesc::Mismatch(b'G'), + /// MatchDesc::Matches(2), + /// MatchDesc::Matches(2), + /// MatchDesc::Deletion(b"AC".to_vec()), + /// MatchDesc::Mismatch(b'T'), + /// MatchDesc::Matches(5)]; + /// assert_eq!(MatchDesc::unparse(&unnorm, false), "10AG4^AC0T5"); + /// assert_eq!(MatchDesc::unparse(&unnorm, true), "10A0G4^AC0T5"); + /// # Ok( () ) + /// # } + /// # fn main() { try_main().unwrap(); } + /// ``` + pub fn unparse(mds: &[MatchDesc], strict_match0: bool) -> String { + let mut mdstr = String::new(); + let mut match_accum = 0; + + let mut md_iter = mds.iter().peekable(); + while let Some(md) = md_iter.next() { + match md { + MatchDesc::Matches(mlen) => { + if md_iter.peek().map_or(false, |next| next.is_matches()) { + match_accum += mlen; + } else { + write!(&mut mdstr, "{}", match_accum + mlen).unwrap(); + match_accum = 0; + } + }, + MatchDesc::Mismatch(refnt) => { + if strict_match0 && md_iter.peek().map_or(true, |next| !next.is_matches()) { + write!(&mut mdstr, "{}0", *refnt as char).unwrap(); + } else { + write!(&mut mdstr, "{}", *refnt as char).unwrap(); + } + }, + MatchDesc::Deletion(refnts) => { + if strict_match0 { + if md_iter.peek().map_or(true, |next| !next.is_matches()) { + write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); + } else { + write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); + } + } else { + if md_iter.peek().map_or(false, |next| next.is_mismatch()) { + write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); + } else { + write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); + } + } + } + }; + } + + mdstr + } + + /// Extract an MD aux field bytestring from a BAM record. + /// + /// # Arguments + /// + /// * `rec` is the record whose MD aux field is extracted + /// + /// # Errors + /// + /// If `rec` does not have a string type MD aux field, an error variant is returned. + pub fn get_md(rec: &bam::record::Record) -> Result<&[u8],MDAlignError> { + rec.aux(b"MD").map_or_else(|| Err(MDAlignError::NoMD), + |aux| match aux { bam::record::Aux::String(md) => Ok(md), _ => Err(MDAlignError::NoMD) }) + } +} + +/// Iterator over `MatchDesc` entries in an MD aux field +pub struct MatchDescIter<'a> { + md: &'a [u8] +} + +impl<'a> MatchDescIter<'a> { + /// Create a new iterator over an MD aux field string + /// + /// # Arguments + /// + /// * `md` is a bytestring of an MD aux field entry + pub fn new(md: &'a [u8]) -> Self { + MatchDescIter{ md: md } + } + + /// Create a new iterator over the MD aux field from a BAM record + /// + /// # Arguments + /// + /// * `record` is the source of the MD aux field for the iterator + /// + /// # Errors + /// + /// If the record does not have a string-valued MD aux field, an + /// error variant is returned. + pub fn new_from_record(record: &'a bam::record::Record) -> Result { + Ok( Self::new(MatchDesc::get_md(record)?) ) + } + + // Requires self.md is non-empty, guaranteed to yield a MatchDesc + fn next_nonempty(&mut self) -> Result + { + let ch = self.md.first().unwrap(); + if ch.is_ascii_digit() { + let endpos = self.md.iter().position(|&c| !c.is_ascii_digit()).unwrap_or(self.md.len()); + let numstr = str::from_utf8(&self.md[0..endpos])?; + self.md = &self.md[endpos..]; + Ok( MatchDesc::Matches(numstr.parse()?) ) + } else if *ch == b'^' { + let endpos = self.md.iter().skip(1).position(|&c| !c.is_ascii_uppercase()).unwrap_or(self.md.len() - 1) + 1; + let del = MatchDesc::Deletion(self.md[1..endpos].to_vec()); + self.md = &self.md[endpos..]; + Ok( del ) + } else if ch.is_ascii_uppercase() { + self.md = &self.md[1..]; + Ok( MatchDesc::Mismatch(*ch) ) + } else { + Err( MDAlignError::BadMD ) + } + } +} + +impl<'a> Iterator for MatchDescIter<'a> { + type Item = Result; + + fn next(&mut self) -> Option + { + if self.md.is_empty() { + None + } else { + Some( self.next_nonempty() ) + } + } +} + +quick_error! { + #[derive(Debug,Clone)] + pub enum MDAlignError { + NoMD { + description("no MD aux field") + } + BadMD { + description("bad MD value") + } + MDvsCIGAR { + description("MD inconsistent with CIGAR") + } + BadSeqLen { + description("Sequence/quality length inconsistent with MD/CIGAR") + } + EmptyAlign { + description("Alignment has no positions") + } + ParseInt(err: ::std::num::ParseIntError) { + from() + } + Utf8(err: ::std::str::Utf8Error) { + from() + } + } +} + +fn fast_compl(nt: u8) -> u8 { + unsafe { + *COMPL.get_unchecked((nt as usize) & 0x7f) + } +} + +const NT_NONE: u8 = b'*'; +static COMPL: [u8; 128] = + [ NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x10 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x20 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x30 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x40 + NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, +// 0x50 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x60 + NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, +// 0x70 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE]; + +#[cfg(test)] +mod tests { + use std::path::Path; + use super::*; + use super::super::*; + + const N_TEST: usize = 14; + + static TEST_READ_NAMES: [&[u8]; N_TEST] + = [ b"K00364:89:HWTF2BBXX:2:1102:3376:38293", + b"K00364:89:HWTF2BBXX:2:1102:3092:38293", + b"K00364:89:HWTF2BBXX:2:1102:3356:38293", + b"K00364:89:HWTF2BBXX:2:1102:3640:38293", + b"K00364:89:HWTF2BBXX:2:1102:20994:38293", + b"K00364:89:HWTF2BBXX:2:1102:18852:38310", + b"K00364:89:HWTF2BBXX:2:1102:7395:38363", + b"K00364:89:HWTF2BBXX:2:1102:12560:38310", + b"K00364:89:HWTF2BBXX:2:1102:13017:38328", + b"K00364:89:HWTF2BBXX:2:1102:31324:38293", + b"K00364:89:HWTF2BBXX:2:1102:27630:38293", + b"K00364:89:HWTF2BBXX:2:1102:15240:38310", + b"K00364:89:HWTF2BBXX:2:1102:3062:38310", + b"K00364:89:HWTF2BBXX:2:1102:30371:38293", + ]; + + static TEST_CIGAR: [&str; N_TEST] = + [ "151M", "151M", "151M", "151M", "149M2S", "149M2S", + "2S149M", "2S149M", "44M2D107M", "133M2D11M7S", + "47M1I103M", "103M1I47M", "53M99N98M", "48M1998N103M" ]; + + static TEST_MD: [&[u8]; N_TEST] = + [ b"151", b"151", b"41T87T21", b"137A13", b"96T52", b"93A0A54", + b"127T21", b"149", b"44^GT107", b"8A68A26A15A12^TG1T9", + b"76T73", b"150", b"151", b"75A75" ]; + + static TEST_ALIGN_CIGAR: [&str; N_TEST] = + [ "151=", "151=", "41=1X87=1X21=", "137=1X13=", "96=1X52=2S", "93=2X54=2S", + "2S127=1X21=", "2S149=", "44=2D107=", "8=1X68=1X26=1X15=1X12=2D1=1X9=7S", + "47=1I29=1X73=", "103=1I47=", "151=", "75=1X75=" ]; + + static TEST_READ_LINE: [&str; N_TEST] = + ["TATCTCTGCTATGGTCGATGCTGCTAAGGATCAAGTTGCCCTAAGAATGCCAGAATTGATTCCAGTCTTGTCTGAAACCATGTGGGACACCAAGAAGGAAGTCAAGGCTGCTGCTACTGCCGCCATGACCAAGGCTACCGAAACTGTTGAC", + "CATAGCAGTGGTTTCAGAAATGTTGGCAGACATTCTGTGGAAAACAGTGAAGTCACCGTTACCCAAGGTGTGGTGCAACAACAATTGCTTAGCTTGAGCAGAGATGGATGGGACACCAACAACGTGCAAAACACCGACGTGTTCAGCGTAA", + "CGTTGACTATGATCAAGTACCAGACGAAATTCAATCAAGAACTGCGTTATGCTTAATGCATAATATGAAGAGAATTATGGACTCTAGCTTTAAATTAGATGAATTGACTGTACAAGATGATTTAATATTCGATGGTTCCTTGATAACTGAG", + "CTGCAAAGCTATATAATGCCACGGCCTTTGGAGAAGACGTGGATGGGGATGTTGGAGCAGTGAATCTGCCCTGACTAGTTTGCGGTGTTGAAGCGGACGAGATTCTAGATTTAGAAAATCTGTTCGACAAGTCATCGGCTTCACGGTCTTT", + "ACGACACAGGAAGACATGTCATTGTTACAACGGGTGTGGGGCAACATCAAATGTGGGCTGCTCAACACTGGACATGGAGAAATCCACATACTTTCACCACATCAGGTGGTTTAGGTACGATGGGTTACGGTCTCCCTGCCGCCATCGGTtc", + "TGCGGAACAGTTTTATTATTGGTACCACCCGTACTGGATATTGGTACGTTTGTATGATTAGTCTCATTGTCACTGTACGAGTCTGAGTGTCTGGGATCTTTAGATTTACTGGCGTGCGACGACTCATGTGTGTTAGATTGGGACATGGGgg", + "tcCCGAACCAGCTTTACAAAATAAGAATTGTATAGAAGCGACAGTAATGCAGTCCAAGGAACGCCCTAATGACAAGATCATACCAACCAAAACCGAAAAAAACGATTTTGGAATAGGCACTCAATGGTTCGAACGCAAACAAATATCAAGA", + "tcTCGGTTGGTGAAATCAAGTACTTAAAAGACCTAACAAACTCTACAATTTGCGCCTTTCTTTCCCCTAATAGAACATGTTAGATTTATAAGCGAAAACGGAAAAGAAAACTACAGCCCAGCAATAAGAGTGCAAATCAAAGTGTGCATGA", + "TTCCTCTACTTCTTACGCTATCAAGAAGAAGGATGAATTGGAAC--GTTGCCAAGTCTAACCGTTAAGAAGCTAAAAAAAGTGAAAGATTTTCAATATTACATATGTTTTTCTTATTATTACTTTTATCTTCTTTGTACACTCTATTGTTTAA", + "GCCCAAATGAAATGTTTTCCTTTACAGTACCATTCATTATCCATGGAACTTGTGAAACATAAGCAACAGAACCATGAGCGGTGGCGAAACCTTTAACCCTGAATGGATCACCTAACATGCGTGACAATAGAGC--TATTACCACTGagatcgg", + "GCTGGTGTTAAGGCTGAATGATTCGATCAAAAGCGAAGTATTTTTTTCTTCCTAGTTTTTATGTAAAAATATATTCACTTTTTTCTCTACGCAGTTGCTGCAGTGTTATAAGAAAAATAGAATGAAATAAGATAAAGCTAGACGAATTGTG", + "CGTCAGCATCGTGGTAGTCAAAAAGTTCATCTGCACCGTACTCTTTCAACAATTTTTCATGTTTACGAGAAGCAACGACGATGATCTTGCTGAAACCGTTTAGTTTTTTTTGCCAATTGAATAAGCATCTGGCCAACAGCAGTGGCACCAC", + "ACAAACCATGAATCCTTTGTAGGATCAATAAACGATCCTGAAGATGGACCTGCTGTTGCACATACTGTTTATTTGGCTGCCTTGGTATACCTCGTGTTTTTCGTATTCTGTGGGTTCCAAGTTTACCTAGCCAGAAGAAAACCTTCGATCG", + "CGACATATGGAGATACTTTATTTCCTTTTCTTAATTATTAACGTATACCTATAAATTAACAAAGTATCTAAACAAGATACATAAGTGTACTCAAACTGAGTAGAATCGTCGATTAAACTTCCTTCTCCTTTTAAAAATTAAAAACAGCAAA"]; + + static TEST_REF_LINE: [&str; N_TEST] = + ["TATCTCTGCTATGGTCGATGCTGCTAAGGATCAAGTTGCCCTAAGAATGCCAGAATTGATTCCAGTCTTGTCTGAAACCATGTGGGACACCAAGAAGGAAGTCAAGGCTGCTGCTACTGCCGCCATGACCAAGGCTACCGAAACTGTTGAC", + "CATAGCAGTGGTTTCAGAAATGTTGGCAGACATTCTGTGGAAAACAGTGAAGTCACCGTTACCCAAGGTGTGGTGCAACAACAATTGCTTAGCTTGAGCAGAGATGGATGGGACACCAACAACGTGCAAAACACCGACGTGTTCAGCGTAA", + "CGTTGACTATGATCAAGTACCAGACGAAATTCAATCAAGAATTGCGTTATGCTTAATGCATAATATGAAGAGAATTATGGACTCTAGCTTTAAATTAGATGAATTGACTGTACAAGATGATTTAATATTTGATGGTTCCTTGATAACTGAG", + "CTGCAAAGCTATATAATGCCACGGCCTTTGGAGAAGACGTGGATGGGGATGTTGGAGCAGTGAATCTGCCCTGACTAGTTTGCGGTGTTGAAGCGGACGAGATTCTAGATTTAGAAAATCTGTTCGACAAGTCATCGACTTCACGGTCTTT", + "ACGACACAGGAAGACATGTCATTGTTACAACGGGTGTGGGGCAACATCAAATGTGGGCTGCTCAACACTGGACATGGAGAAATCCACATACTTTCATCACATCAGGTGGTTTAGGTACGATGGGTTACGGTCTCCCTGCCGCCATCGGT ", + "TGCGGAACAGTTTTATTATTGGTACCACCCGTACTGGATATTGGTACGTTTGTATGATTAGTCTCATTGTCACTGTACGAGTCTGAGTGTCTGAAATCTTTAGATTTACTGGCGTGCGACGACTCATGTGTGTTAGATTGGGACATGGG ", + " CCGAACCAGCTTTACAAAATAAGAATTGTATAGAAGCGACAGTAATGCAGTCCAAGGAACGCCCTAATGACAAGATCATACCAACCAAAACCGAAAAAAACGATTTTGGAATAGGCACTCAATGGTTTGAACGCAAACAAATATCAAGA", + " TCGGTTGGTGAAATCAAGTACTTAAAAGACCTAACAAACTCTACAATTTGCGCCTTTCTTTCCCCTAATAGAACATGTTAGATTTATAAGCGAAAACGGAAAAGAAAACTACAGCCCAGCAATAAGAGTGCAAATCAAAGTGTGCATGA", + "TTCCTCTACTTCTTACGCTATCAAGAAGAAGGATGAATTGGAACGTGTTGCCAAGTCTAACCGTTAAGAAGCTAAAAAAAGTGAAAGATTTTCAATATTACATATGTTTTTCTTATTATTACTTTTATCTTCTTTGTACACTCTATTGTTTAA", + "GCCCAAATAAAATGTTTTCCTTTACAGTACCATTCATTATCCATGGAACTTGTGAAACATAAGCAACAGAACCATGAACGGTGGCGAAACCTTTAACCCTGAATAGATCACCTAACATGCATGACAATAGAGCTGTTTTACCACTG ", + "GCTGGTGTTAAGGCTGAATGATTCGATCAAAAGCGAAGTATTTTTTT-TTCCTAGTTTTTATGTAAAAATATATTCATTTTTTTCTCTACGCAGTTGCTGCAGTGTTATAAGAAAAATAGAATGAAATAAGATAAAGCTAGACGAATTGTG", + "CGTCAGCATCGTGGTAGTCAAAAAGTTCATCTGCACCGTACTCTTTCAACAATTTTTCATGTTTACGAGAAGCAACGACGATGATCTTGCTGAAACCGTTTAG-TTTTTTTGCCAATTGAATAAGCATCTGGCCAACAGCAGTGGCACCAC", + "ACAAACCATGAATCCTTTGTAGGATCAATAAACGATCCTGAAGATGGACCTGCTGTTGCACATACTGTTTATTTGGCTGCCTTGGTATACCTCGTGTTTTTCGTATTCTGTGGGTTCCAAGTTTACCTAGCCAGAAGAAAACCTTCGATCG", + "CGACATATGGAGATACTTTATTTCCTTTTCTTAATTATTAACGTATACCTATAAATTAACAAAGTATCTAAACAAAATACATAAGTGTACTCAAACTGAGTAGAATCGTCGATTAAACTTCCTTCTCCTTTTAAAAATTAAAAACAGCAAA"]; + + #[test] + fn test_md_align() { + let mut bam = Reader::from_path(&Path::new("test/test_md_align.bam")) + .ok() + .expect("Error opening file."); + + for (i, record) in bam.records().enumerate() { + let mut rec = record.ok().expect("Error reading BAM record"); + assert_eq!(rec.qname(), TEST_READ_NAMES[i]); + + assert_eq!(MatchDesc::get_md(&rec).ok().expect("No MD field"), TEST_MD[i]); + assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); + + let ap_iter = AlignPosIter::new_from_record(&rec).ok().expect("Creating AlignPosIter"); + let ap_res: Result,MDAlignError> = ap_iter.collect(); + let aps = ap_res.ok().expect("Unable to create Vec"); + let ap_read_line: String = aps.iter().map(|ap| ap.read_line_char()).collect(); + assert_eq!(ap_read_line, TEST_READ_LINE[i]); + let ap_ref_line: String = aps.iter().map(|ap| ap.ref_line_char()).collect(); + assert_eq!(ap_ref_line, TEST_REF_LINE[i]); + + let rap_iter = RecAlignPosIter::new(&rec).ok().expect("Creating RecAlignPosIter"); + let rap_res: Result,MDAlignError> = rap_iter.collect(); + let raps = rap_res.ok().expect("Unable to create Vec"); + let rap_read_line: String = raps.iter().map(|rap| rap.read_line_char()).collect(); + assert_eq!(rap_read_line, TEST_READ_LINE[i]); + let rap_ref_line: String = raps.iter().map(|rap| rap.ref_line_char()).collect(); + assert_eq!(rap_ref_line, TEST_REF_LINE[i]); + + let md_iter = MDPosIter::new_from_record(&rec).ok().expect("Creating MDPosIter"); + let alignment = md_iter.collect_into_alignment().ok().expect("collecting into alignment"); + let a_cigar = alignment.cigar(false); + assert_eq!(a_cigar, TEST_ALIGN_CIGAR[i]); + } + } +} diff --git a/src/bam/mod.rs b/src/bam/mod.rs index 4a61233c2..e6f048aae 100644 --- a/src/bam/mod.rs +++ b/src/bam/mod.rs @@ -7,6 +7,7 @@ pub mod buffer; pub mod header; +pub mod md_align; pub mod pileup; pub mod record; diff --git a/test/test_md_align.bam b/test/test_md_align.bam new file mode 100644 index 0000000000000000000000000000000000000000..6c1f87eb7895b0a6671344cd031404ae87cddca7 GIT binary patch literal 2221 zcmV;e2vYYSiwFb&00000{{{d;LjnK}0(FwjZqq;%g>L~>I3Rw+q6>tuy3TlhJu~(y zK&1i-X{rb$R%K$W##QV@9+w7*1rpD|1F(P+2X=Z165x?&AMu*O55Jr=nP^Ui#87GYDCT1nplv6F5 zn3tGSDH&;EQDW+I8gSm^e#w0$wQ8lvs;44U(k50_3JFLcTIsqW7s|01y&5zSn#g9; zw5TrES1RcAhWpNwy>OCPQ!r;ZSnrmP#=P>`9`(aroMr`1vg6q_F;g?f(N&a7q4(d;V(@-)y;dB4{JK&r0ry_*)XImG^Q*OzaCm>?RPgTe zty*dL`Elb6VEAo&ttf#VbFEeszIQvdV(`h^cScQ!hfkq#eiG?j=J16UqSYSfzDhgAVKyJAP-wnQn+zdJ7 zX7V58keg4z*m2My$y>WUqt?er+X}Xt?V*>h?qQyJ>*=@V${Jb9M1@S=ZOD?Uj4u@u=LZs@Zg}+AZf*a_e}KSm(SF zjniJ$mTO1GXNt6_r50W|X_R-KnDoA3Omb?-NM6Ums?IZ^GinseiTR+@ZEVHrUrsKVbAu}_*}wdL z6J)P~Y|@MDXkK>6N?UVpg4E7 zSk8=bJhvy)6rZFQiXUY=G}$+R@;T7^`tuVYd%J&TC*92EK`RlX%B0RoIC7+!&x5D~ zd1-~voKp}DbG8na0vE_K^H$;@;55Q+obrN{KKry0KDf1+z4>iRcB4af17vr`i{06x z?BabLc#i_%htJ30UhW;->CUKwSJlLo5act~WWj6AnQ27R2;(&(O%8_8iU*$?}T z=`-)LjeglJNAvkkOZQ?7FNQNMWkXteCw#zUhY1>(z>zQw+$tIKIH@%`DXj`JV?s7? z2{#Oa2q5b+pVhz|(unR}b`O$F*MfK$_2SW|AG<9B{QVlj^XXzzb^zZ5;3J@N`gZ_4 zcn^SkrFXo$JBi+{#-kNrpKHcL5H>V6W0?WZc$f!oy|6~fV59;}qqH%q(Jok+K{dJ0 znO1n%c}N2)M(WQ#k$WG!pjdn6(f27<&&qoz2>^d{9s3$f%qu<8lbySmkGuuU$H3;5 zzYQRXi+yuEtLBqPd0Oogj-*|tnS@~_2{Z<8t>!u7R@B@{u34Q~q2XLkX zPz0};BRCjih>;MIpX>|xhu@Yl{ud~JwN4i0eI||%x@h0eZaxE9yxlv?AX8B&#fOrbv=fXbnEkf(s6H&1xo0Ot9rS7;A!x72}z#$#RY=;fQ-LJ$ghMg|RP$ z0^xX1oA+uQLjr_$v_!O-P+oK4dM zN8vBszvI?$E53U=JVtC}YJxj&7Dp$|FpR}y8{|bAZk%+Z8WRM41T#!Lzq9Nt6 z)DVP6A;jW&ZW^pCR3P*X(FmDYCTgRl;OHmty5}N)GCdy-Sw!Dva zcpn4rT|M2^4YFK& z!om*qnQF~#h&j%zL5vMS#lj)c0e-1@sJ+AmNIwZqK1$wVBA{{ya~7!@?tkQc{O#;s z2JoS1)_hCD_H29|J_7v1PjtIV8}n~Eu8znDZNz8O#jM-wZh-K!G1Ffh1>_rSv!|*j z<<3&o5pIMWVdsuZXBulHgyDp$S#A|N9E2U5Fad3uOQv%yU;vn~6pZi#ToW{+FI1C} zM9r?5OrPem@Ba5~{aqaB`rr3?WUzc7PDUNXZv*j9!R6DBo<-&OZQmSMqv@*H;oGS| zImOQ6&k^KQF!A>ai%yYw!qPeO%J^8AO%S0`wem(Vl}A$>;ZU&yd>)wM87B>|U=Eyp z4Be*y|L{LqT)U)Sh?IZX7Q2sM+C-3F+2|SMYE;d}tu3PP|M8mxTjWfLh6@&K0BaYp zqP8p&6@&xg)>%TkgRhN2or8_77P-@b7`(2s5a10wHc>);)cdoBr3eF~a vHn|J?#R&U Date: Tue, 4 Sep 2018 08:41:37 -0700 Subject: [PATCH 02/16] rustfmt --- src/bam/md_align.rs | 943 +++++++++++++++++++++++++++----------------- 1 file changed, 576 insertions(+), 367 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 28dc6e5dc..4c64b6896 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -4,7 +4,7 @@ use std::str; use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; use bam; -use bam::record::{Cigar,CigarString}; +use bam::record::{Cigar, CigarString}; /// Position in a BAM record alignment, based on CIGAR and MD /// information. @@ -85,7 +85,8 @@ pub trait MDAlignPos { /// deletions. fn read_line_char(&self) -> char { if self.ref_pos_or_next().is_none() { - self.read_nt().map_or('-', |nt| nt.to_ascii_lowercase() as char) + self.read_nt() + .map_or('-', |nt| nt.to_ascii_lowercase() as char) } else { self.read_nt().map_or('-', |nt| nt as char) } @@ -108,19 +109,22 @@ pub trait MDAlignPos { /// base when present, a `-` for read insertions, and a space for /// soft-clipped positions. fn ref_line_char(&self) -> char { - self.ref_nt().map_or_else(|| - if self.ref_pos_or_next().is_none() { - ' ' - } else { - '-' - }, - |nt| nt as char) + self.ref_nt().map_or_else( + || { + if self.ref_pos_or_next().is_none() { + ' ' + } else { + '-' + } + }, + |nt| nt as char, + ) } } /// Self-contained data type containing all information about an /// alignment position in a BAM record. -#[derive(Debug,Clone,PartialEq,Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct AlignPos { ref_nt: Option, read_nt_qual: Option<(u8, u8)>, @@ -131,20 +135,19 @@ pub struct AlignPos { } impl MDAlignPos for AlignPos { - fn read_nt(&self) -> Option - { + fn read_nt(&self) -> Option { self.read_nt_qual.as_ref().map(|(nt, _q)| *nt) } - fn ref_nt(&self) -> Option { self.ref_nt } + fn ref_nt(&self) -> Option { + self.ref_nt + } - fn read_qual(&self) -> Option - { + fn read_qual(&self) -> Option { self.read_nt_qual.as_ref().map(|(_nt, q)| *q) } - fn read_seq_pos(&self) -> Option - { + fn read_seq_pos(&self) -> Option { if self.read_nt_qual.is_some() { Some(self.read_seq_pos_or_next) } else { @@ -152,10 +155,11 @@ impl MDAlignPos for AlignPos { } } - fn read_seq_pos_or_next(&self) -> u32 { self.read_seq_pos_or_next } + fn read_seq_pos_or_next(&self) -> u32 { + self.read_seq_pos_or_next + } - fn ref_pos(&self) -> Option - { + fn ref_pos(&self) -> Option { if self.ref_nt.is_some() { self.ref_pos_or_next } else { @@ -163,39 +167,38 @@ impl MDAlignPos for AlignPos { } } - fn ref_pos_or_next(&self) -> Option { self.ref_pos_or_next } - - fn read_true_pos(&self) -> Option - { - self.read_seq_pos() - .map(|seq_pos| - if self.is_reverse { - self.read_len - (1 + seq_pos) - } else { - seq_pos - }) + fn ref_pos_or_next(&self) -> Option { + self.ref_pos_or_next } - - fn read_true_nt(&self) -> Option - { - self.read_nt() - .map(|seq_nt| - if self.is_reverse { - fast_compl(seq_nt) - } else { - seq_nt - }) + + fn read_true_pos(&self) -> Option { + self.read_seq_pos().map(|seq_pos| { + if self.is_reverse { + self.read_len - (1 + seq_pos) + } else { + seq_pos + } + }) } - fn ref_true_nt(&self) -> Option - { - self.ref_nt() - .map(|seq_nt| - if self.is_reverse { - fast_compl(seq_nt) - } else { - seq_nt - }) + fn read_true_nt(&self) -> Option { + self.read_nt().map(|seq_nt| { + if self.is_reverse { + fast_compl(seq_nt) + } else { + seq_nt + } + }) + } + + fn ref_true_nt(&self) -> Option { + self.ref_nt().map(|seq_nt| { + if self.is_reverse { + fast_compl(seq_nt) + } else { + seq_nt + } + }) } } @@ -214,50 +217,60 @@ pub struct AlignPosIter { impl AlignPosIter { /// Create a new iterator for a BAM record. - /// + /// /// # Arguments /// /// * `record` is the BAM record whose alignment will be extracted - pub fn new_from_record(record: &bam::record::Record) -> Result - { + pub fn new_from_record(record: &bam::record::Record) -> Result { let align_iter = MDPosIter::new_from_record(record)?; - Ok( AlignPosIter { align_iter: align_iter, - read_seq: record.seq().as_bytes(), - read_qual: record.qual().to_vec(), - is_reverse: record.is_reverse(), } ) + Ok(AlignPosIter { + align_iter: align_iter, + read_seq: record.seq().as_bytes(), + read_qual: record.qual().to_vec(), + is_reverse: record.is_reverse(), + }) } - - fn pos_to_align_pos(&self, pos: MDPos) -> Result - { + + fn pos_to_align_pos(&self, pos: MDPos) -> Result { let read_seq_pos = pos.read_seq_pos(); let ref_pos = pos.ref_pos(); let read_nt_qual = match read_seq_pos { - Some(p) => Some( (*self.read_seq.get(p as usize).ok_or_else(|| MDAlignError::BadSeqLen)?, - *self.read_qual.get(p as usize).ok_or_else(|| MDAlignError::BadSeqLen)?) ), + Some(p) => Some(( + *self + .read_seq + .get(p as usize) + .ok_or_else(|| MDAlignError::BadSeqLen)?, + *self + .read_qual + .get(p as usize) + .ok_or_else(|| MDAlignError::BadSeqLen)?, + )), None => None, }; // When we don't have a ref_nt but we do have a ref_pos, this should be a match // and there should be a defined read_nt - let ref_nt = pos.ref_nt().or_else(|| - if ref_pos.is_some() { - Some( read_nt_qual.unwrap().0 ) - } else { - None - }); - Ok( AlignPos { ref_nt: ref_nt, - read_nt_qual: read_nt_qual, - is_reverse: self.is_reverse, - read_seq_pos_or_next: pos.read_seq_pos_or_next(), - ref_pos_or_next: pos.ref_pos_or_next(), - read_len: self.read_seq.len() as u32 } ) + let ref_nt = pos.ref_nt().or_else(|| { + if ref_pos.is_some() { + Some(read_nt_qual.unwrap().0) + } else { + None + } + }); + Ok(AlignPos { + ref_nt: ref_nt, + read_nt_qual: read_nt_qual, + is_reverse: self.is_reverse, + read_seq_pos_or_next: pos.read_seq_pos_or_next(), + ref_pos_or_next: pos.ref_pos_or_next(), + read_len: self.read_seq.len() as u32, + }) } /// Collects all alignment positions into a _read strand_ vector /// of alignment positions. This is right to left on the reference /// sequence, for reverse strand alignments. #[allow(unused_must_use)] - pub fn read_strand_alignment(&mut self) -> Result,MDAlignError> - { + pub fn read_strand_alignment(&mut self) -> Result, MDAlignError> { let mut aln: Result, MDAlignError> = self.collect(); if self.is_reverse { @@ -271,13 +284,11 @@ impl AlignPosIter { impl Iterator for AlignPosIter { type Item = Result; - fn next(&mut self) -> Option - { - self.align_iter.next().map(|res| - { - let pos = res?; - self.pos_to_align_pos(pos) - }) + fn next(&mut self) -> Option { + self.align_iter.next().map(|res| { + let pos = res?; + self.pos_to_align_pos(pos) + }) } } @@ -289,10 +300,12 @@ pub struct RecAlignPos<'a> { pos: MDPos, } -impl <'a> RecAlignPos<'a> { - fn new(record: &'a bam::record::Record, pos: MDPos) -> Self - { - RecAlignPos { record: record, pos: pos } +impl<'a> RecAlignPos<'a> { + fn new(record: &'a bam::record::Record, pos: MDPos) -> Self { + RecAlignPos { + record: record, + pos: pos, + } } fn read_nt_at(&self, at: u32) -> u8 { @@ -304,60 +317,69 @@ impl <'a> RecAlignPos<'a> { } } -impl <'a> MDAlignPos for RecAlignPos<'a> { +impl<'a> MDAlignPos for RecAlignPos<'a> { fn read_nt(&self) -> Option { - self.pos.read_seq_pos() - .map(|pos| self.read_nt_at(pos)) + self.pos.read_seq_pos().map(|pos| self.read_nt_at(pos)) } fn ref_nt(&self) -> Option { - self.pos.ref_nt().or_else(|| - if self.ref_pos().is_some() { - self.read_nt() - } else { - None - }) + self.pos.ref_nt().or_else(|| { + if self.ref_pos().is_some() { + self.read_nt() + } else { + None + } + }) } fn read_qual(&self) -> Option { + self.pos.read_seq_pos().map(|pos| self.qual_at(pos)) + } + + fn read_seq_pos(&self) -> Option { self.pos.read_seq_pos() - .map(|pos| self.qual_at(pos)) } - fn read_seq_pos(&self) -> Option { self.pos.read_seq_pos() } + fn read_seq_pos_or_next(&self) -> u32 { + self.pos.read_seq_pos_or_next() + } - fn read_seq_pos_or_next(&self) -> u32 { self.pos.read_seq_pos_or_next() } + fn ref_pos(&self) -> Option { + self.pos.ref_pos() + } - fn ref_pos(&self) -> Option { self.pos.ref_pos() } + fn ref_pos_or_next(&self) -> Option { + self.pos.ref_pos_or_next() + } - fn ref_pos_or_next(&self) -> Option { self.pos.ref_pos_or_next() } - fn read_true_pos(&self) -> Option { - self.read_seq_pos() - .map(|seq_pos| - if self.record.is_reverse() { - self.record.qual().len() as u32 - (1 + seq_pos) - } else { - seq_pos - }) + self.read_seq_pos().map(|seq_pos| { + if self.record.is_reverse() { + self.record.qual().len() as u32 - (1 + seq_pos) + } else { + seq_pos + } + }) } fn read_true_nt(&self) -> Option { - self.read_nt().map(|nt| - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - }) + self.read_nt().map(|nt| { + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) } fn ref_true_nt(&self) -> Option { - self.ref_nt().map(|nt| - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - }) + self.ref_nt().map(|nt| { + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) } } @@ -372,23 +394,25 @@ pub struct RecAlignPosIter<'a> { record: &'a bam::record::Record, } -impl <'a> RecAlignPosIter<'a> { +impl<'a> RecAlignPosIter<'a> { /// Create a new iterator for a BAM record. - /// + /// /// # Arguments /// /// * `record` is the BAM record whose alignment will be extracted - pub fn new(record: &'a bam::record::Record) -> Result { + pub fn new(record: &'a bam::record::Record) -> Result { let align_iter = MDPosIter::new_from_record(record)?; - Ok( RecAlignPosIter{ align_iter: align_iter, record: record } ) + Ok(RecAlignPosIter { + align_iter: align_iter, + record: record, + }) } /// Collects all alignment positions into a _read strand_ vector /// of alignment positions. This is right to left on the reference /// sequence, for reverse strand alignments. #[allow(unused_must_use)] - pub fn read_strand_alignment(&mut self) -> Result>,MDAlignError> - { + pub fn read_strand_alignment(&mut self) -> Result>, MDAlignError> { let mut aln: Result>, MDAlignError> = self.collect(); if self.record.is_reverse() { @@ -399,12 +423,12 @@ impl <'a> RecAlignPosIter<'a> { } } -impl <'a> Iterator for RecAlignPosIter<'a> { +impl<'a> Iterator for RecAlignPosIter<'a> { type Item = Result, MDAlignError>; - fn next(&mut self) -> Option - { - self.align_iter.next() + fn next(&mut self) -> Option { + self.align_iter + .next() .map(|res| res.map(|mdpos| RecAlignPos::new(self.record, mdpos))) } } @@ -414,13 +438,29 @@ impl <'a> Iterator for RecAlignPosIter<'a> { /// represent all information from these fields -- together with the /// sequence itself, these can fully reconstruct the read-vs-reference /// alignment. -#[derive(Debug,Clone,PartialEq,Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum MDPos { - Match{read_seq_pos: u32, ref_pos: u32}, - Mismatch{ref_nt: u8, read_seq_pos: u32, ref_pos: u32}, - Insert{read_seq_pos: u32, ref_pos_next: u32}, - Delete{ref_nt: u8, read_seq_pos_next: u32, ref_pos: u32}, - SoftClip{read_seq_pos: u32}, + Match { + read_seq_pos: u32, + ref_pos: u32, + }, + Mismatch { + ref_nt: u8, + read_seq_pos: u32, + ref_pos: u32, + }, + Insert { + read_seq_pos: u32, + ref_pos_next: u32, + }, + Delete { + ref_nt: u8, + read_seq_pos_next: u32, + ref_pos: u32, + }, + SoftClip { + read_seq_pos: u32, + }, } #[allow(unused_variables)] @@ -433,11 +473,25 @@ impl MDPos { /// `None is returned for read deletions. pub fn read_seq_pos(&self) -> Option { match self { - MDPos::Match{ read_seq_pos, ref_pos } => Some(*read_seq_pos), - MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*read_seq_pos), - MDPos::Insert{ read_seq_pos, ref_pos_next } => Some(*read_seq_pos), - MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => None, - MDPos::SoftClip{ read_seq_pos } => Some(*read_seq_pos), + MDPos::Match { + read_seq_pos, + ref_pos, + } => Some(*read_seq_pos), + MDPos::Mismatch { + ref_nt, + read_seq_pos, + ref_pos, + } => Some(*read_seq_pos), + MDPos::Insert { + read_seq_pos, + ref_pos_next, + } => Some(*read_seq_pos), + MDPos::Delete { + ref_nt, + read_seq_pos_next, + ref_pos, + } => None, + MDPos::SoftClip { read_seq_pos } => Some(*read_seq_pos), } } @@ -448,11 +502,25 @@ impl MDPos { /// original input sequence for reverse-strand alignments. pub fn read_seq_pos_or_next(&self) -> u32 { match self { - MDPos::Match{ read_seq_pos, ref_pos } => *read_seq_pos, - MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => *read_seq_pos, - MDPos::Insert{ read_seq_pos, ref_pos_next } => *read_seq_pos, - MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => *read_seq_pos_next, - MDPos::SoftClip{ read_seq_pos } => *read_seq_pos, + MDPos::Match { + read_seq_pos, + ref_pos, + } => *read_seq_pos, + MDPos::Mismatch { + ref_nt, + read_seq_pos, + ref_pos, + } => *read_seq_pos, + MDPos::Insert { + read_seq_pos, + ref_pos_next, + } => *read_seq_pos, + MDPos::Delete { + ref_nt, + read_seq_pos_next, + ref_pos, + } => *read_seq_pos_next, + MDPos::SoftClip { read_seq_pos } => *read_seq_pos, } } @@ -462,11 +530,25 @@ impl MDPos { /// soft clipped positions. pub fn ref_pos(&self) -> Option { match self { - MDPos::Match{ read_seq_pos, ref_pos } => Some(*ref_pos), - MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_pos), - MDPos::Insert{ read_seq_pos, ref_pos_next } => None, - MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_pos), - MDPos::SoftClip{ read_seq_pos } => None, + MDPos::Match { + read_seq_pos, + ref_pos, + } => Some(*ref_pos), + MDPos::Mismatch { + ref_nt, + read_seq_pos, + ref_pos, + } => Some(*ref_pos), + MDPos::Insert { + read_seq_pos, + ref_pos_next, + } => None, + MDPos::Delete { + ref_nt, + read_seq_pos_next, + ref_pos, + } => Some(*ref_pos), + MDPos::SoftClip { read_seq_pos } => None, } } @@ -477,11 +559,25 @@ impl MDPos { /// `None` is returned for soft clipped positions. pub fn ref_pos_or_next(&self) -> Option { match self { - MDPos::Match{ read_seq_pos, ref_pos } => Some(*ref_pos), - MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_pos), - MDPos::Insert{ read_seq_pos, ref_pos_next } => Some(*ref_pos_next), - MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_pos), - MDPos::SoftClip{ read_seq_pos } => None, + MDPos::Match { + read_seq_pos, + ref_pos, + } => Some(*ref_pos), + MDPos::Mismatch { + ref_nt, + read_seq_pos, + ref_pos, + } => Some(*ref_pos), + MDPos::Insert { + read_seq_pos, + ref_pos_next, + } => Some(*ref_pos_next), + MDPos::Delete { + ref_nt, + read_seq_pos_next, + ref_pos, + } => Some(*ref_pos), + MDPos::SoftClip { read_seq_pos } => None, } } @@ -491,17 +587,31 @@ impl MDPos { /// clipped positions. pub fn ref_nt(&self) -> Option { match self { - MDPos::Match{ read_seq_pos, ref_pos } => None, - MDPos::Mismatch{ ref_nt, read_seq_pos, ref_pos } => Some(*ref_nt), - MDPos::Insert{ read_seq_pos, ref_pos_next } => None, - MDPos::Delete{ ref_nt, read_seq_pos_next, ref_pos } => Some(*ref_nt), - MDPos::SoftClip{ read_seq_pos } => None, + MDPos::Match { + read_seq_pos, + ref_pos, + } => None, + MDPos::Mismatch { + ref_nt, + read_seq_pos, + ref_pos, + } => Some(*ref_nt), + MDPos::Insert { + read_seq_pos, + ref_pos_next, + } => None, + MDPos::Delete { + ref_nt, + read_seq_pos_next, + ref_pos, + } => Some(*ref_nt), + MDPos::SoftClip { read_seq_pos } => None, } } } /// Iterator over the `MDPos` positions represented by a BAM -/// record. +/// record. /// /// Note that these positions are generated in reference sequence /// order. For a reverse-strand alignment, they run from the last to @@ -516,38 +626,45 @@ pub struct MDPosIter { impl MDPosIter { /// Create a new iterator for a BAM record. - /// + /// /// # Arguments /// /// * `record` is the BAM record whose alignment will be extracted - pub fn new_from_record(record: &bam::record::Record) -> Result { + pub fn new_from_record(record: &bam::record::Record) -> Result { let mut md_stack = MatchDesc::parse_from_record(record)?; md_stack.reverse(); - + let CigarString(mut cigar_stack) = (*record.cigar()).clone(); cigar_stack.reverse(); - - Ok( MDPosIter { md_stack: md_stack, - cigar_stack: cigar_stack, - ref_pos_curr: record.pos() as u32, - read_pos_curr: 0 } ) + + Ok(MDPosIter { + md_stack: md_stack, + cigar_stack: cigar_stack, + ref_pos_curr: record.pos() as u32, + read_pos_curr: 0, + }) } // Utility function that yields the next MDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries // are cleared from the tops of those respective stacks. - fn next_with_some(&mut self) -> Result - { + fn next_with_some(&mut self) -> Result { let pop_cigar; let res = match self.cigar_stack.last_mut().unwrap() { Cigar::Match(ref mut ciglen) => { let pop_md; - let mmm = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + let mmm = match self + .md_stack + .last_mut() + .ok_or_else(|| MDAlignError::MDvsCIGAR)? + { MatchDesc::Matches(ref mut mdlen) => { - let pos = MDPos::Match { read_seq_pos: self.read_pos_curr, - ref_pos: self.ref_pos_curr, }; + let pos = MDPos::Match { + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, + }; self.read_pos_curr += 1; self.ref_pos_curr += 1; *mdlen -= 1; @@ -555,64 +672,78 @@ impl MDPosIter { *ciglen -= 1; pop_cigar = *ciglen == 0; pos - }, + } MatchDesc::Mismatch(ref_nt) => { - let pos = MDPos::Mismatch { ref_nt: *ref_nt, - read_seq_pos: self.read_pos_curr, - ref_pos: self.ref_pos_curr, }; + let pos = MDPos::Mismatch { + ref_nt: *ref_nt, + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, + }; self.read_pos_curr += 1; self.ref_pos_curr += 1; pop_md = true; *ciglen -= 1; pop_cigar = *ciglen == 0; pos - }, + } _ => { - return Err( MDAlignError::MDvsCIGAR ); + return Err(MDAlignError::MDvsCIGAR); } }; if pop_md { self.md_stack.pop(); } mmm - }, + } Cigar::Equal(ref mut ciglen) => { let pop_md; - let m = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + let m = match self + .md_stack + .last_mut() + .ok_or_else(|| MDAlignError::MDvsCIGAR)? + { MatchDesc::Matches(ref mut mdlen) => { - let pos = MDPos::Match { read_seq_pos: self.read_pos_curr, - ref_pos: self.ref_pos_curr, }; + let pos = MDPos::Match { + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, + }; self.read_pos_curr += 1; self.ref_pos_curr += 1; - + *mdlen -= 1; pop_md = *mdlen == 0; *ciglen -= 1; pop_cigar = *ciglen == 0; pos - }, - _ => return Err( MDAlignError::MDvsCIGAR ), + } + _ => return Err(MDAlignError::MDvsCIGAR), }; if pop_md { self.md_stack.pop(); } m - }, - Cigar::Diff(ref mut ciglen) => { + } + Cigar::Diff(ref mut ciglen) => { let pop_md; - let mm = match self.md_stack.last_mut().ok_or_else(|| MDAlignError::MDvsCIGAR)? { + let mm = match self + .md_stack + .last_mut() + .ok_or_else(|| MDAlignError::MDvsCIGAR)? + { MatchDesc::Mismatch(ref_nt) => { - let pos = MDPos::Mismatch { ref_nt: *ref_nt, - read_seq_pos: self.read_pos_curr, - ref_pos: self.ref_pos_curr, }; + let pos = MDPos::Mismatch { + ref_nt: *ref_nt, + read_seq_pos: self.read_pos_curr, + ref_pos: self.ref_pos_curr, + }; self.read_pos_curr += 1; self.ref_pos_curr += 1; pop_md = true; *ciglen -= 1; pop_cigar = *ciglen == 0; pos - }, - _ => return Err( MDAlignError::MDvsCIGAR ), + } + _ => return Err(MDAlignError::MDvsCIGAR), }; if pop_md { self.md_stack.pop(); @@ -620,13 +751,15 @@ impl MDPosIter { mm } Cigar::Ins(ref mut len) => { - let mut pos = MDPos::Insert { read_seq_pos: self.read_pos_curr, - ref_pos_next: self.ref_pos_curr }; + let mut pos = MDPos::Insert { + read_seq_pos: self.read_pos_curr, + ref_pos_next: self.ref_pos_curr, + }; self.read_pos_curr += 1; *len -= 1; pop_cigar = *len == 0; pos - }, + } Cigar::Del(ref mut len) => { let pop_md; @@ -635,16 +768,18 @@ impl MDPosIter { if ref_nts.len() > 0 { let ref_nt = ref_nts.remove(0); pop_md = ref_nts.is_empty(); - let pos = MDPos::Delete { ref_nt: ref_nt, - ref_pos: self.ref_pos_curr, - read_seq_pos_next: self.read_pos_curr }; + let pos = MDPos::Delete { + ref_nt: ref_nt, + ref_pos: self.ref_pos_curr, + read_seq_pos_next: self.read_pos_curr, + }; self.ref_pos_curr += 1; pos } else { - return Err( MDAlignError::MDvsCIGAR ); + return Err(MDAlignError::MDvsCIGAR); } - }, - _ => return Err( MDAlignError::MDvsCIGAR ), + } + _ => return Err(MDAlignError::MDvsCIGAR), }; *len -= 1; pop_cigar = *len == 0; @@ -652,31 +787,33 @@ impl MDPosIter { self.md_stack.pop(); } del - }, + } Cigar::SoftClip(ref mut len) => { - let pos = MDPos::SoftClip { read_seq_pos: self.read_pos_curr }; + let pos = MDPos::SoftClip { + read_seq_pos: self.read_pos_curr, + }; self.read_pos_curr += 1; *len -= 1; pop_cigar = *len == 0; pos - }, - Cigar::RefSkip(_) => panic!("RefSkip in next_with_some"), + } + Cigar::RefSkip(_) => panic!("RefSkip in next_with_some"), Cigar::HardClip(_) => panic!("HardClip in next_with_some"), - Cigar::Pad(_) => panic!("Pad in next_with_some"), + Cigar::Pad(_) => panic!("Pad in next_with_some"), }; if pop_cigar { self.cigar_stack.pop(); } - Ok( res ) + Ok(res) } /// Collect the alignment positions from this record into an /// `Alignment`. This is always a semi-global alignment that /// includes the soft clipping from the read as `Xclip` operations. - pub fn collect_into_alignment(self) -> Result { - let mds_result: Result,MDAlignError> = self.collect(); + pub fn collect_into_alignment(self) -> Result { + let mds_result: Result, MDAlignError> = self.collect(); let mds = mds_result?; let mut iter = mds.as_slice().into_iter(); @@ -685,7 +822,11 @@ impl MDPosIter { let mut curr = iter.next(); let mut left_clip = 0; - while curr.ok_or_else(||MDAlignError::EmptyAlign)?.ref_pos_or_next().is_none() { + while curr + .ok_or_else(|| MDAlignError::EmptyAlign)? + .ref_pos_or_next() + .is_none() + { left_clip += 1; curr = iter.next(); } @@ -702,27 +843,39 @@ impl MDPosIter { while let Some(md) = curr { match md { - MDPos::Match{read_seq_pos, ref_pos} => { + MDPos::Match { + read_seq_pos, + ref_pos, + } => { ops.push(AlignmentOperation::Match); xend = *read_seq_pos; yend = *ref_pos; - }, - MDPos::Mismatch{ref_nt: _, read_seq_pos, ref_pos} => { + } + MDPos::Mismatch { + ref_nt: _, + read_seq_pos, + ref_pos, + } => { ops.push(AlignmentOperation::Subst); xend = *read_seq_pos; yend = *ref_pos; - }, - MDPos::Insert{read_seq_pos, ref_pos_next: _} => { + } + MDPos::Insert { + read_seq_pos, + ref_pos_next: _, + } => { ops.push(AlignmentOperation::Ins); xend = *read_seq_pos; - }, - MDPos::Delete{ref_nt: _, read_seq_pos_next: _, ref_pos} => { + } + MDPos::Delete { + ref_nt: _, + read_seq_pos_next: _, + ref_pos, + } => { ops.push(AlignmentOperation::Del); yend = *ref_pos; - }, - MDPos::SoftClip{read_seq_pos: _} => { - break - }, + } + MDPos::SoftClip { read_seq_pos: _ } => break, } curr = iter.next(); } @@ -731,28 +884,28 @@ impl MDPosIter { let mut xlen = xend; while let Some(md) = curr { match md { - MDPos::SoftClip{read_seq_pos} => { + MDPos::SoftClip { read_seq_pos } => { xlen = *read_seq_pos; - }, + } _ => { - return Err( MDAlignError::BadMD ); - }, + return Err(MDAlignError::BadMD); + } }; right_clip += 1; curr = iter.next(); } ops.push(AlignmentOperation::Xclip(right_clip)); - Ok( Alignment { + Ok(Alignment { score: 0, - xstart: xstart as usize, + xstart: xstart as usize, ystart: ystart as usize, xend: xend as usize, yend: yend as usize, ylen: (1 + yend - ystart) as usize, xlen: xlen as usize, operations: ops, - mode: AlignmentMode::Semiglobal + mode: AlignmentMode::Semiglobal, }) } } @@ -760,15 +913,14 @@ impl MDPosIter { impl Iterator for MDPosIter { type Item = Result; - fn next(&mut self) -> Option - { + fn next(&mut self) -> Option { // Process non-yielding cigar entries loop { let handled = match self.cigar_stack.last() { Some(Cigar::RefSkip(len)) => { self.ref_pos_curr += *len; true - }, + } Some(Cigar::HardClip(_len)) => true, Some(Cigar::Pad(_len)) => true, _ => false, @@ -779,7 +931,7 @@ impl Iterator for MDPosIter { break; } } - + // Consume 0-length match while self.md_stack.last() == Some(&MatchDesc::Matches(0)) { self.md_stack.pop(); @@ -788,17 +940,20 @@ impl Iterator for MDPosIter { if self.cigar_stack.is_empty() { None } else { - Some( self.next_with_some().map_err(|e| ::std::convert::From::from(e)) ) + Some( + self.next_with_some() + .map_err(|e| ::std::convert::From::from(e)), + ) } } } /// Abstract representation of entries in an MD aux field -#[derive(Debug,Clone,PartialEq,Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum MatchDesc { Matches(u32), Mismatch(u8), - Deletion(Vec) + Deletion(Vec), } impl MatchDesc { @@ -807,7 +962,9 @@ impl MatchDesc { /// # Arguments /// /// * `len` is the length of the match - pub fn new_matches(len: u32) -> Self { MatchDesc::Matches(len) } + pub fn new_matches(len: u32) -> Self { + MatchDesc::Matches(len) + } /// Create a new mismatched MD entry. /// @@ -816,15 +973,14 @@ impl MatchDesc { /// * `refnt` is the reference base at the mismatched position /// /// # Errors - /// + /// /// If `refnt` is not an upper-case ASCII character, an error /// variant is returned. - pub fn new_mismatch(refnt: u8) -> Result - { + pub fn new_mismatch(refnt: u8) -> Result { if refnt.is_ascii_uppercase() { - Ok( MatchDesc::Mismatch(refnt) ) + Ok(MatchDesc::Mismatch(refnt)) } else { - Err( MDAlignError::BadMD ) + Err(MDAlignError::BadMD) } } @@ -838,36 +994,32 @@ impl MatchDesc { /// /// If `refnts` is empty or contains invalid (non upper-case /// ASCII) characters, an error variant is returned. - pub fn new_deletion(refnts: &[u8]) -> Result - { + pub fn new_deletion(refnts: &[u8]) -> Result { if refnts.len() > 0 && refnts.iter().all(|&c| c.is_ascii_uppercase()) { - Ok( MatchDesc::Deletion(refnts.to_vec()) ) + Ok(MatchDesc::Deletion(refnts.to_vec())) } else { - Err( MDAlignError::BadMD ) + Err(MDAlignError::BadMD) } } /// True if this `MatchDesc` represents a run of matches - pub fn is_matches(&self) -> bool - { - match self { - MatchDesc::Matches(_) => true, + pub fn is_matches(&self) -> bool { + match self { + MatchDesc::Matches(_) => true, _ => false, - } + } } /// True if this `MatchDesc` represents a mismatch - pub fn is_mismatch(&self) -> bool - { - match self { + pub fn is_mismatch(&self) -> bool { + match self { MatchDesc::Mismatch(_) => true, _ => false, } } /// True if this `MatchDesc` represents a read deletion relative to the reference - pub fn is_deletion(&self) -> bool - { + pub fn is_deletion(&self) -> bool { match self { MatchDesc::Deletion(_) => true, _ => false, @@ -877,25 +1029,25 @@ impl MatchDesc { /// Create a `Vec` of `MatchDesc` entries corresponding to the MD aux field on a record. /// /// # Arguments - /// + /// /// * `record` is the source of the MD aux field to parse /// /// # Errors - /// - /// If `record` has no MD aux field, or if the value of the aux field is malformed, + /// + /// If `record` has no MD aux field, or if the value of the aux field is malformed, /// an error variant will be returned. - pub fn parse_from_record(record: &bam::record::Record) -> Result,MDAlignError> { + pub fn parse_from_record(record: &bam::record::Record) -> Result, MDAlignError> { Self::parse(Self::get_md(record)?) } /// Parses a bytestring as an MD aux field description. /// /// # Arguments - /// + /// /// * `mdstring` is a bytestring MD aux field /// /// # Errors - /// + /// /// If the `mdstring` is not a well-formed MD aux field, an error variant is returned. /// /// # Examples @@ -904,7 +1056,7 @@ impl MatchDesc { /// use rust_htslib::bam::md_align::{MatchDesc,MDAlignError}; /// # fn try_main() -> Result<(), MDAlignError> { /// assert_eq!(MatchDesc::parse(b"10A5^AC6")?, - /// [MatchDesc::Matches(10), + /// [MatchDesc::Matches(10), /// MatchDesc::Mismatch(b'A'), /// MatchDesc::Matches(5), /// MatchDesc::Deletion(b"AC".to_vec()), @@ -913,7 +1065,7 @@ impl MatchDesc { /// # } /// # fn main() { try_main().unwrap(); } /// ``` - pub fn parse(mdstring: &[u8]) -> Result,MDAlignError> { + pub fn parse(mdstring: &[u8]) -> Result, MDAlignError> { MatchDescIter::new(mdstring).collect() } @@ -965,7 +1117,7 @@ impl MatchDesc { pub fn unparse(mds: &[MatchDesc], strict_match0: bool) -> String { let mut mdstr = String::new(); let mut match_accum = 0; - + let mut md_iter = mds.iter().peekable(); while let Some(md) = md_iter.next() { match md { @@ -976,63 +1128,68 @@ impl MatchDesc { write!(&mut mdstr, "{}", match_accum + mlen).unwrap(); match_accum = 0; } - }, + } MatchDesc::Mismatch(refnt) => { if strict_match0 && md_iter.peek().map_or(true, |next| !next.is_matches()) { write!(&mut mdstr, "{}0", *refnt as char).unwrap(); } else { write!(&mut mdstr, "{}", *refnt as char).unwrap(); } - }, + } MatchDesc::Deletion(refnts) => { if strict_match0 { if md_iter.peek().map_or(true, |next| !next.is_matches()) { write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); - } else { + } else { write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); } } else { if md_iter.peek().map_or(false, |next| next.is_mismatch()) { - write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); + write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); } else { - write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); + write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); } } } }; } - + mdstr } /// Extract an MD aux field bytestring from a BAM record. /// /// # Arguments - /// + /// /// * `rec` is the record whose MD aux field is extracted /// /// # Errors /// /// If `rec` does not have a string type MD aux field, an error variant is returned. - pub fn get_md(rec: &bam::record::Record) -> Result<&[u8],MDAlignError> { - rec.aux(b"MD").map_or_else(|| Err(MDAlignError::NoMD), - |aux| match aux { bam::record::Aux::String(md) => Ok(md), _ => Err(MDAlignError::NoMD) }) + pub fn get_md(rec: &bam::record::Record) -> Result<&[u8], MDAlignError> { + rec.aux(b"MD").map_or_else( + || Err(MDAlignError::NoMD), + |aux| match aux { + bam::record::Aux::String(md) => Ok(md), + _ => Err(MDAlignError::NoMD), + }, + ) } } /// Iterator over `MatchDesc` entries in an MD aux field -pub struct MatchDescIter<'a> { - md: &'a [u8] +pub struct MatchDescIter<'a> { + md: &'a [u8], } impl<'a> MatchDescIter<'a> { /// Create a new iterator over an MD aux field string /// /// # Arguments - /// + /// /// * `md` is a bytestring of an MD aux field entry pub fn new(md: &'a [u8]) -> Self { - MatchDescIter{ md: md } + MatchDescIter { md: md } } /// Create a new iterator over the MD aux field from a BAM record @@ -1045,53 +1202,60 @@ impl<'a> MatchDescIter<'a> { /// /// If the record does not have a string-valued MD aux field, an /// error variant is returned. - pub fn new_from_record(record: &'a bam::record::Record) -> Result { - Ok( Self::new(MatchDesc::get_md(record)?) ) + pub fn new_from_record(record: &'a bam::record::Record) -> Result { + Ok(Self::new(MatchDesc::get_md(record)?)) } // Requires self.md is non-empty, guaranteed to yield a MatchDesc - fn next_nonempty(&mut self) -> Result - { + fn next_nonempty(&mut self) -> Result { let ch = self.md.first().unwrap(); if ch.is_ascii_digit() { - let endpos = self.md.iter().position(|&c| !c.is_ascii_digit()).unwrap_or(self.md.len()); + let endpos = self + .md + .iter() + .position(|&c| !c.is_ascii_digit()) + .unwrap_or(self.md.len()); let numstr = str::from_utf8(&self.md[0..endpos])?; self.md = &self.md[endpos..]; - Ok( MatchDesc::Matches(numstr.parse()?) ) + Ok(MatchDesc::Matches(numstr.parse()?)) } else if *ch == b'^' { - let endpos = self.md.iter().skip(1).position(|&c| !c.is_ascii_uppercase()).unwrap_or(self.md.len() - 1) + 1; + let endpos = self + .md + .iter() + .skip(1) + .position(|&c| !c.is_ascii_uppercase()) + .unwrap_or(self.md.len() - 1) + 1; let del = MatchDesc::Deletion(self.md[1..endpos].to_vec()); self.md = &self.md[endpos..]; - Ok( del ) + Ok(del) } else if ch.is_ascii_uppercase() { self.md = &self.md[1..]; - Ok( MatchDesc::Mismatch(*ch) ) + Ok(MatchDesc::Mismatch(*ch)) } else { - Err( MDAlignError::BadMD ) + Err(MDAlignError::BadMD) } } } impl<'a> Iterator for MatchDescIter<'a> { - type Item = Result; + type Item = Result; - fn next(&mut self) -> Option - { + fn next(&mut self) -> Option { if self.md.is_empty() { None } else { - Some( self.next_nonempty() ) + Some(self.next_nonempty()) } } } quick_error! { - #[derive(Debug,Clone)] + #[derive(Debug,Clone)] pub enum MDAlignError { NoMD { description("no MD aux field") } - BadMD { + BadMD { description("bad MD value") } MDvsCIGAR { @@ -1113,76 +1277,109 @@ quick_error! { } fn fast_compl(nt: u8) -> u8 { - unsafe { - *COMPL.get_unchecked((nt as usize) & 0x7f) - } + unsafe { *COMPL.get_unchecked((nt as usize) & 0x7f) } } const NT_NONE: u8 = b'*'; -static COMPL: [u8; 128] = - [ NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, -// 0x10 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, -// 0x20 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, -// 0x30 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, -// 0x40 - NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, -// 0x50 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, -// 0x60 - NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, -// 0x70 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE]; +static COMPL: [u8; 128] = [ + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, // 0x10 + NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, // 0x20 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + // 0x30 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, // 0x40 + NT_NONE, b'T', + NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, b'N', NT_NONE, // 0x50 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + // 0x60 + NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, // 0x70 + NT_NONE, NT_NONE, NT_NONE, + NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, +]; #[cfg(test)] mod tests { - use std::path::Path; - use super::*; use super::super::*; + use super::*; + use std::path::Path; const N_TEST: usize = 14; - static TEST_READ_NAMES: [&[u8]; N_TEST] - = [ b"K00364:89:HWTF2BBXX:2:1102:3376:38293", - b"K00364:89:HWTF2BBXX:2:1102:3092:38293", - b"K00364:89:HWTF2BBXX:2:1102:3356:38293", - b"K00364:89:HWTF2BBXX:2:1102:3640:38293", - b"K00364:89:HWTF2BBXX:2:1102:20994:38293", - b"K00364:89:HWTF2BBXX:2:1102:18852:38310", - b"K00364:89:HWTF2BBXX:2:1102:7395:38363", - b"K00364:89:HWTF2BBXX:2:1102:12560:38310", - b"K00364:89:HWTF2BBXX:2:1102:13017:38328", - b"K00364:89:HWTF2BBXX:2:1102:31324:38293", - b"K00364:89:HWTF2BBXX:2:1102:27630:38293", - b"K00364:89:HWTF2BBXX:2:1102:15240:38310", - b"K00364:89:HWTF2BBXX:2:1102:3062:38310", - b"K00364:89:HWTF2BBXX:2:1102:30371:38293", - ]; - - static TEST_CIGAR: [&str; N_TEST] = - [ "151M", "151M", "151M", "151M", "149M2S", "149M2S", - "2S149M", "2S149M", "44M2D107M", "133M2D11M7S", - "47M1I103M", "103M1I47M", "53M99N98M", "48M1998N103M" ]; - - static TEST_MD: [&[u8]; N_TEST] = - [ b"151", b"151", b"41T87T21", b"137A13", b"96T52", b"93A0A54", - b"127T21", b"149", b"44^GT107", b"8A68A26A15A12^TG1T9", - b"76T73", b"150", b"151", b"75A75" ]; - - static TEST_ALIGN_CIGAR: [&str; N_TEST] = - [ "151=", "151=", "41=1X87=1X21=", "137=1X13=", "96=1X52=2S", "93=2X54=2S", - "2S127=1X21=", "2S149=", "44=2D107=", "8=1X68=1X26=1X15=1X12=2D1=1X9=7S", - "47=1I29=1X73=", "103=1I47=", "151=", "75=1X75=" ]; + static TEST_READ_NAMES: [&[u8]; N_TEST] = [ + b"K00364:89:HWTF2BBXX:2:1102:3376:38293", + b"K00364:89:HWTF2BBXX:2:1102:3092:38293", + b"K00364:89:HWTF2BBXX:2:1102:3356:38293", + b"K00364:89:HWTF2BBXX:2:1102:3640:38293", + b"K00364:89:HWTF2BBXX:2:1102:20994:38293", + b"K00364:89:HWTF2BBXX:2:1102:18852:38310", + b"K00364:89:HWTF2BBXX:2:1102:7395:38363", + b"K00364:89:HWTF2BBXX:2:1102:12560:38310", + b"K00364:89:HWTF2BBXX:2:1102:13017:38328", + b"K00364:89:HWTF2BBXX:2:1102:31324:38293", + b"K00364:89:HWTF2BBXX:2:1102:27630:38293", + b"K00364:89:HWTF2BBXX:2:1102:15240:38310", + b"K00364:89:HWTF2BBXX:2:1102:3062:38310", + b"K00364:89:HWTF2BBXX:2:1102:30371:38293", + ]; + + static TEST_CIGAR: [&str; N_TEST] = [ + "151M", + "151M", + "151M", + "151M", + "149M2S", + "149M2S", + "2S149M", + "2S149M", + "44M2D107M", + "133M2D11M7S", + "47M1I103M", + "103M1I47M", + "53M99N98M", + "48M1998N103M", + ]; + + static TEST_MD: [&[u8]; N_TEST] = [ + b"151", + b"151", + b"41T87T21", + b"137A13", + b"96T52", + b"93A0A54", + b"127T21", + b"149", + b"44^GT107", + b"8A68A26A15A12^TG1T9", + b"76T73", + b"150", + b"151", + b"75A75", + ]; + + static TEST_ALIGN_CIGAR: [&str; N_TEST] = [ + "151=", + "151=", + "41=1X87=1X21=", + "137=1X13=", + "96=1X52=2S", + "93=2X54=2S", + "2S127=1X21=", + "2S149=", + "44=2D107=", + "8=1X68=1X26=1X15=1X12=2D1=1X9=7S", + "47=1I29=1X73=", + "103=1I47=", + "151=", + "75=1X75=", + ]; static TEST_READ_LINE: [&str; N_TEST] = ["TATCTCTGCTATGGTCGATGCTGCTAAGGATCAAGTTGCCCTAAGAATGCCAGAATTGATTCCAGTCTTGTCTGAAACCATGTGGGACACCAAGAAGGAAGTCAAGGCTGCTGCTACTGCCGCCATGACCAAGGCTACCGAAACTGTTGAC", @@ -1199,7 +1396,7 @@ mod tests { "CGTCAGCATCGTGGTAGTCAAAAAGTTCATCTGCACCGTACTCTTTCAACAATTTTTCATGTTTACGAGAAGCAACGACGATGATCTTGCTGAAACCGTTTAGTTTTTTTTGCCAATTGAATAAGCATCTGGCCAACAGCAGTGGCACCAC", "ACAAACCATGAATCCTTTGTAGGATCAATAAACGATCCTGAAGATGGACCTGCTGTTGCACATACTGTTTATTTGGCTGCCTTGGTATACCTCGTGTTTTTCGTATTCTGTGGGTTCCAAGTTTACCTAGCCAGAAGAAAACCTTCGATCG", "CGACATATGGAGATACTTTATTTCCTTTTCTTAATTATTAACGTATACCTATAAATTAACAAAGTATCTAAACAAGATACATAAGTGTACTCAAACTGAGTAGAATCGTCGATTAAACTTCCTTCTCCTTTTAAAAATTAAAAACAGCAAA"]; - + static TEST_REF_LINE: [&str; N_TEST] = ["TATCTCTGCTATGGTCGATGCTGCTAAGGATCAAGTTGCCCTAAGAATGCCAGAATTGATTCCAGTCTTGTCTGAAACCATGTGGGACACCAAGAAGGAAGTCAAGGCTGCTGCTACTGCCGCCATGACCAAGGCTACCGAAACTGTTGAC", "CATAGCAGTGGTTTCAGAAATGTTGGCAGACATTCTGTGGAAAACAGTGAAGTCACCGTTACCCAAGGTGTGGTGCAACAACAATTGCTTAGCTTGAGCAGAGATGGATGGGACACCAACAACGTGCAAAACACCGACGTGTTCAGCGTAA", @@ -1226,27 +1423,39 @@ mod tests { let mut rec = record.ok().expect("Error reading BAM record"); assert_eq!(rec.qname(), TEST_READ_NAMES[i]); - assert_eq!(MatchDesc::get_md(&rec).ok().expect("No MD field"), TEST_MD[i]); + assert_eq!( + MatchDesc::get_md(&rec).ok().expect("No MD field"), + TEST_MD[i] + ); assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); - let ap_iter = AlignPosIter::new_from_record(&rec).ok().expect("Creating AlignPosIter"); - let ap_res: Result,MDAlignError> = ap_iter.collect(); + let ap_iter = AlignPosIter::new_from_record(&rec) + .ok() + .expect("Creating AlignPosIter"); + let ap_res: Result, MDAlignError> = ap_iter.collect(); let aps = ap_res.ok().expect("Unable to create Vec"); let ap_read_line: String = aps.iter().map(|ap| ap.read_line_char()).collect(); assert_eq!(ap_read_line, TEST_READ_LINE[i]); let ap_ref_line: String = aps.iter().map(|ap| ap.ref_line_char()).collect(); assert_eq!(ap_ref_line, TEST_REF_LINE[i]); - let rap_iter = RecAlignPosIter::new(&rec).ok().expect("Creating RecAlignPosIter"); - let rap_res: Result,MDAlignError> = rap_iter.collect(); + let rap_iter = RecAlignPosIter::new(&rec) + .ok() + .expect("Creating RecAlignPosIter"); + let rap_res: Result, MDAlignError> = rap_iter.collect(); let raps = rap_res.ok().expect("Unable to create Vec"); let rap_read_line: String = raps.iter().map(|rap| rap.read_line_char()).collect(); assert_eq!(rap_read_line, TEST_READ_LINE[i]); let rap_ref_line: String = raps.iter().map(|rap| rap.ref_line_char()).collect(); assert_eq!(rap_ref_line, TEST_REF_LINE[i]); - let md_iter = MDPosIter::new_from_record(&rec).ok().expect("Creating MDPosIter"); - let alignment = md_iter.collect_into_alignment().ok().expect("collecting into alignment"); + let md_iter = MDPosIter::new_from_record(&rec) + .ok() + .expect("Creating MDPosIter"); + let alignment = md_iter + .collect_into_alignment() + .ok() + .expect("collecting into alignment"); let a_cigar = alignment.cigar(false); assert_eq!(a_cigar, TEST_ALIGN_CIGAR[i]); } From 0e98f8cf2fe43bd63aec05ccaebea5229646487d Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Tue, 4 Sep 2018 19:10:49 -0700 Subject: [PATCH 03/16] Unmangle table --- src/bam/md_align.rs | 48 +++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 4c64b6896..b12b39d85 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1281,29 +1281,31 @@ fn fast_compl(nt: u8) -> u8 { } const NT_NONE: u8 = b'*'; -static COMPL: [u8; 128] = [ - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, // 0x10 - NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, // 0x20 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - // 0x30 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, // 0x40 - NT_NONE, b'T', - NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, b'N', NT_NONE, // 0x50 - NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - // 0x60 - NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', NT_NONE, NT_NONE, - NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, // 0x70 - NT_NONE, NT_NONE, NT_NONE, - NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, - NT_NONE, NT_NONE, -]; +#[cfg_attr(rustfmt, rustfmt_skip)] +static COMPL: [u8; 128] = + [ NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x10 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x20 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x30 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x40 + NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, +// 0x50 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, +// 0x60 + NT_NONE, b'T', NT_NONE, b'G', NT_NONE, NT_NONE, NT_NONE, b'C', + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'N', NT_NONE, +// 0x70 + NT_NONE, NT_NONE, NT_NONE, NT_NONE, b'A', NT_NONE, NT_NONE, NT_NONE, + NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE, NT_NONE]; #[cfg(test)] mod tests { From 93c268ab80f6c3ba5380ebf4af65d1ccc4198e2b Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Sun, 23 Sep 2018 10:47:15 -0700 Subject: [PATCH 04/16] Moved MD accessor convenience function onto record --- src/bam/md_align.rs | 25 +++---------------------- src/bam/record.rs | 13 +++++++++++++ 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index b12b39d85..316724a3d 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1037,7 +1037,7 @@ impl MatchDesc { /// If `record` has no MD aux field, or if the value of the aux field is malformed, /// an error variant will be returned. pub fn parse_from_record(record: &bam::record::Record) -> Result, MDAlignError> { - Self::parse(Self::get_md(record)?) + Self::parse(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?) } /// Parses a bytestring as an MD aux field description. @@ -1156,25 +1156,6 @@ impl MatchDesc { mdstr } - - /// Extract an MD aux field bytestring from a BAM record. - /// - /// # Arguments - /// - /// * `rec` is the record whose MD aux field is extracted - /// - /// # Errors - /// - /// If `rec` does not have a string type MD aux field, an error variant is returned. - pub fn get_md(rec: &bam::record::Record) -> Result<&[u8], MDAlignError> { - rec.aux(b"MD").map_or_else( - || Err(MDAlignError::NoMD), - |aux| match aux { - bam::record::Aux::String(md) => Ok(md), - _ => Err(MDAlignError::NoMD), - }, - ) - } } /// Iterator over `MatchDesc` entries in an MD aux field @@ -1203,7 +1184,7 @@ impl<'a> MatchDescIter<'a> { /// If the record does not have a string-valued MD aux field, an /// error variant is returned. pub fn new_from_record(record: &'a bam::record::Record) -> Result { - Ok(Self::new(MatchDesc::get_md(record)?)) + Ok(Self::new(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?)) } // Requires self.md is non-empty, guaranteed to yield a MatchDesc @@ -1426,7 +1407,7 @@ mod tests { assert_eq!(rec.qname(), TEST_READ_NAMES[i]); assert_eq!( - MatchDesc::get_md(&rec).ok().expect("No MD field"), + rec.get_md().ok().expect("No MD field"), TEST_MD[i] ); assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); diff --git a/src/bam/record.rs b/src/bam/record.rs index a03705b9a..ea427c6ed 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -590,6 +590,19 @@ impl Record { unset_supplementary, 2048u16 ); + + /// Shorthand to return MD aux field bytestring from a BAM + /// record. If the record doesn't have an MD field, or the field + /// isn't a string field, then `None` is returned. + pub fn aux_md(&self) -> Option<&[u8]> { + self.aux(b"MD").map_or( + None, + |aux| match aux { + Aux::String(md) => Some(md), + _ => None, + }, + ) + } } impl Drop for Record { From 8e3516007b04e49abae2dcd9c57a195e1762a3fc Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Sun, 23 Sep 2018 11:32:20 -0700 Subject: [PATCH 05/16] Fix up tests and fmt --- src/bam/md_align.rs | 9 ++++----- src/bam/record.rs | 11 ++++------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 316724a3d..ac611257f 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1184,7 +1184,9 @@ impl<'a> MatchDescIter<'a> { /// If the record does not have a string-valued MD aux field, an /// error variant is returned. pub fn new_from_record(record: &'a bam::record::Record) -> Result { - Ok(Self::new(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?)) + Ok(Self::new( + record.aux_md().ok_or_else(|| MDAlignError::NoMD)?, + )) } // Requires self.md is non-empty, guaranteed to yield a MatchDesc @@ -1406,10 +1408,7 @@ mod tests { let mut rec = record.ok().expect("Error reading BAM record"); assert_eq!(rec.qname(), TEST_READ_NAMES[i]); - assert_eq!( - rec.get_md().ok().expect("No MD field"), - TEST_MD[i] - ); + assert_eq!(rec.aux_md().unwrap(), TEST_MD[i]); assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); let ap_iter = AlignPosIter::new_from_record(&rec) diff --git a/src/bam/record.rs b/src/bam/record.rs index ea427c6ed..905766cf5 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -595,13 +595,10 @@ impl Record { /// record. If the record doesn't have an MD field, or the field /// isn't a string field, then `None` is returned. pub fn aux_md(&self) -> Option<&[u8]> { - self.aux(b"MD").map_or( - None, - |aux| match aux { - Aux::String(md) => Some(md), - _ => None, - }, - ) + self.aux(b"MD").map_or(None, |aux| match aux { + Aux::String(md) => Some(md), + _ => None, + }) } } From 5d199513c221b85998f2893e31c1a87c49d9ad84 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Mon, 24 Sep 2018 09:47:35 -0700 Subject: [PATCH 06/16] Simplified Cigar/MD reconstruction interface --- src/bam/md_align.rs | 510 +++++++++++++------------------------------- 1 file changed, 153 insertions(+), 357 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index ac611257f..d2841fedc 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -6,27 +6,60 @@ use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; use bam; use bam::record::{Cigar, CigarString}; -/// Position in a BAM record alignment, based on CIGAR and MD -/// information. -pub trait MDAlignPos { +/// Alignment position on a BAM record, reconstructed using read +/// sequence information along with the Cigar string and the MD Aux +/// field. +pub struct MDAlignPos<'a> { + record: &'a bam::record::Record, + pos: CigarMDPos, +} + +impl<'a> MDAlignPos<'a> { + fn new(record: &'a bam::record::Record, pos: CigarMDPos) -> Self { + MDAlignPos { + record: record, + pos: pos, + } + } + + fn read_nt_at(&self, at: u32) -> u8 { + self.record.seq()[at as usize] + } + + fn qual_at(&self, at: u32) -> u8 { + self.record.qual()[at as usize] + } + /// Read nucleotide sequence as reported in the BAM /// alignment. _Note_ that this is the complement of the read /// sequence itself for reverse-strand alignments. /// /// `None` is returned for read deletions. - fn read_nt(&self) -> Option; + pub fn read_nt(&self) -> Option { + self.pos.read_seq_pos().map(|pos| self.read_nt_at(pos)) + } /// Reference nucleotide sequence as reported in the BAM alignment. /// /// `None` is returned for read insertions and /// soft clipped positions. - fn ref_nt(&self) -> Option; + pub fn ref_nt(&self) -> Option { + self.pos.ref_nt().or_else(|| { + if self.ref_pos().is_some() { + self.read_nt() + } else { + None + } + }) + } /// Read nucleotide quality score as reported in the BAM /// alignment. /// /// `None` is returned at read deletion positions. - fn read_qual(&self) -> Option; + pub fn read_qual(&self) -> Option { + self.pos.read_seq_pos().map(|pos| self.qual_at(pos)) + } /// Zero-based offset in the read sequence as reported in the BAM /// alignment. _Note_ that these positions will run from the last @@ -34,41 +67,65 @@ pub trait MDAlignPos { /// reverse-strand alignments. /// /// `None is returned for read deletions. - fn read_seq_pos(&self) -> Option; + pub fn read_seq_pos(&self) -> Option { + self.pos.read_seq_pos() + } /// Zero-based offset in the read sequence as reported in the BAM /// alignment. For read deletions, the offset of the next aligned /// read sequence nucleotide is reported. _Note_ that these /// positions will run from the last to the first base in the /// original input sequence for reverse-strand alignments. - fn read_seq_pos_or_next(&self) -> u32; + pub fn read_seq_pos_or_next(&self) -> u32 { + self.pos.read_seq_pos_or_next() + } /// Zero-based offset in the reference sequence. /// /// `None` is returned for read insertions and /// soft clipped positions. - fn ref_pos(&self) -> Option; + pub fn ref_pos(&self) -> Option { + self.pos.ref_pos() + } /// Zero-based offset in the reference sequence. For read /// insertions, the offset of the next aligned reference sequence /// nucleotide is reported. /// /// `None` is returned for soft clipped positions. - fn ref_pos_or_next(&self) -> Option; + pub fn ref_pos_or_next(&self) -> Option { + self.pos.ref_pos_or_next() + } /// Zero-based offset for this position in the original input read /// sequence. Position 0 is the first nucleotide from the input /// read sequence for either forward or reverse strand alignments. /// /// `None` is returned for read deletions. - fn read_true_pos(&self) -> Option; + pub fn read_true_pos(&self) -> Option { + self.read_seq_pos().map(|seq_pos| { + if self.record.is_reverse() { + self.record.qual().len() as u32 - (1 + seq_pos) + } else { + seq_pos + } + }) + } /// Nucleotide in the original input read sequence, taking into /// account the fact that reverse-strand alignments report the /// reverse-complement of the input read sequence. /// /// `None` is returned for read deletions. - fn read_true_nt(&self) -> Option; + pub fn read_true_nt(&self) -> Option { + self.read_nt().map(|nt| { + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) + } /// Nucleotide for the reference sequence that is matched against /// `read_true_nt()`. _Note_ that this is the complement of the @@ -77,13 +134,21 @@ pub trait MDAlignPos { /// /// `None` is returned for read insertions and soft-clipped /// positions. - fn ref_true_nt(&self) -> Option; + pub fn ref_true_nt(&self) -> Option { + self.ref_nt().map(|nt| { + if self.record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) + } /// Character for read sequence line in pretty-printed alignment /// format. This character is the read sequence base, in /// lower-case for soft-clipped positions, or a `-` for read /// deletions. - fn read_line_char(&self) -> char { + pub fn read_line_char(&self) -> char { if self.ref_pos_or_next().is_none() { self.read_nt() .map_or('-', |nt| nt.to_ascii_lowercase() as char) @@ -95,7 +160,7 @@ pub trait MDAlignPos { /// Character for middle match line in pretty-printed alignment /// format. This is a vertical bar at match positions and a space /// otherwise. - fn match_line_char(&self) -> char { + pub fn match_line_char(&self) -> char { let read = self.read_nt(); if read.is_some() && read == self.ref_nt() { '|' @@ -108,7 +173,7 @@ pub trait MDAlignPos { /// alignment format. This character is the reference sequence /// base when present, a `-` for read insertions, and a space for /// soft-clipped positions. - fn ref_line_char(&self) -> char { + pub fn ref_line_char(&self) -> char { self.ref_nt().map_or_else( || { if self.ref_pos_or_next().is_none() { @@ -122,324 +187,55 @@ pub trait MDAlignPos { } } -/// Self-contained data type containing all information about an -/// alignment position in a BAM record. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct AlignPos { - ref_nt: Option, - read_nt_qual: Option<(u8, u8)>, - is_reverse: bool, - read_seq_pos_or_next: u32, - ref_pos_or_next: Option, - read_len: u32, -} - -impl MDAlignPos for AlignPos { - fn read_nt(&self) -> Option { - self.read_nt_qual.as_ref().map(|(nt, _q)| *nt) - } - - fn ref_nt(&self) -> Option { - self.ref_nt - } - - fn read_qual(&self) -> Option { - self.read_nt_qual.as_ref().map(|(_nt, q)| *q) - } - - fn read_seq_pos(&self) -> Option { - if self.read_nt_qual.is_some() { - Some(self.read_seq_pos_or_next) - } else { - None - } - } - - fn read_seq_pos_or_next(&self) -> u32 { - self.read_seq_pos_or_next - } - - fn ref_pos(&self) -> Option { - if self.ref_nt.is_some() { - self.ref_pos_or_next - } else { - None - } - } - - fn ref_pos_or_next(&self) -> Option { - self.ref_pos_or_next - } - - fn read_true_pos(&self) -> Option { - self.read_seq_pos().map(|seq_pos| { - if self.is_reverse { - self.read_len - (1 + seq_pos) - } else { - seq_pos - } - }) - } - - fn read_true_nt(&self) -> Option { - self.read_nt().map(|seq_nt| { - if self.is_reverse { - fast_compl(seq_nt) - } else { - seq_nt - } - }) - } - - fn ref_true_nt(&self) -> Option { - self.ref_nt().map(|seq_nt| { - if self.is_reverse { - fast_compl(seq_nt) - } else { - seq_nt - } - }) - } -} - -/// Iterator that generates `AlignPos` alignment positions for a BAM +/// Iterator that generates `MDAlignPos` alignment positions for a BAM /// record. /// /// Note that these positions are generated in reference sequence /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. -pub struct AlignPosIter { - align_iter: MDPosIter, - read_seq: Vec, - read_qual: Vec, - is_reverse: bool, -} - -impl AlignPosIter { - /// Create a new iterator for a BAM record. - /// - /// # Arguments - /// - /// * `record` is the BAM record whose alignment will be extracted - pub fn new_from_record(record: &bam::record::Record) -> Result { - let align_iter = MDPosIter::new_from_record(record)?; - Ok(AlignPosIter { - align_iter: align_iter, - read_seq: record.seq().as_bytes(), - read_qual: record.qual().to_vec(), - is_reverse: record.is_reverse(), - }) - } - - fn pos_to_align_pos(&self, pos: MDPos) -> Result { - let read_seq_pos = pos.read_seq_pos(); - let ref_pos = pos.ref_pos(); - let read_nt_qual = match read_seq_pos { - Some(p) => Some(( - *self - .read_seq - .get(p as usize) - .ok_or_else(|| MDAlignError::BadSeqLen)?, - *self - .read_qual - .get(p as usize) - .ok_or_else(|| MDAlignError::BadSeqLen)?, - )), - None => None, - }; - // When we don't have a ref_nt but we do have a ref_pos, this should be a match - // and there should be a defined read_nt - let ref_nt = pos.ref_nt().or_else(|| { - if ref_pos.is_some() { - Some(read_nt_qual.unwrap().0) - } else { - None - } - }); - Ok(AlignPos { - ref_nt: ref_nt, - read_nt_qual: read_nt_qual, - is_reverse: self.is_reverse, - read_seq_pos_or_next: pos.read_seq_pos_or_next(), - ref_pos_or_next: pos.ref_pos_or_next(), - read_len: self.read_seq.len() as u32, - }) - } - - /// Collects all alignment positions into a _read strand_ vector - /// of alignment positions. This is right to left on the reference - /// sequence, for reverse strand alignments. - #[allow(unused_must_use)] - pub fn read_strand_alignment(&mut self) -> Result, MDAlignError> { - let mut aln: Result, MDAlignError> = self.collect(); - - if self.is_reverse { - aln.as_mut().map(|a| a.reverse()); - } - - aln - } -} - -impl Iterator for AlignPosIter { - type Item = Result; - - fn next(&mut self) -> Option { - self.align_iter.next().map(|res| { - let pos = res?; - self.pos_to_align_pos(pos) - }) - } -} - -/// Alignment position on a BAM record. Read sequence and quality -/// information can be extracted directly because a reference to the -/// original record is retained. -pub struct RecAlignPos<'a> { - record: &'a bam::record::Record, - pos: MDPos, -} - -impl<'a> RecAlignPos<'a> { - fn new(record: &'a bam::record::Record, pos: MDPos) -> Self { - RecAlignPos { - record: record, - pos: pos, - } - } - - fn read_nt_at(&self, at: u32) -> u8 { - self.record.seq()[at as usize] - } - - fn qual_at(&self, at: u32) -> u8 { - self.record.qual()[at as usize] - } -} - -impl<'a> MDAlignPos for RecAlignPos<'a> { - fn read_nt(&self) -> Option { - self.pos.read_seq_pos().map(|pos| self.read_nt_at(pos)) - } - - fn ref_nt(&self) -> Option { - self.pos.ref_nt().or_else(|| { - if self.ref_pos().is_some() { - self.read_nt() - } else { - None - } - }) - } - - fn read_qual(&self) -> Option { - self.pos.read_seq_pos().map(|pos| self.qual_at(pos)) - } - - fn read_seq_pos(&self) -> Option { - self.pos.read_seq_pos() - } - - fn read_seq_pos_or_next(&self) -> u32 { - self.pos.read_seq_pos_or_next() - } - - fn ref_pos(&self) -> Option { - self.pos.ref_pos() - } - - fn ref_pos_or_next(&self) -> Option { - self.pos.ref_pos_or_next() - } - - fn read_true_pos(&self) -> Option { - self.read_seq_pos().map(|seq_pos| { - if self.record.is_reverse() { - self.record.qual().len() as u32 - (1 + seq_pos) - } else { - seq_pos - } - }) - } - - fn read_true_nt(&self) -> Option { - self.read_nt().map(|nt| { - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - } - }) - } - - fn ref_true_nt(&self) -> Option { - self.ref_nt().map(|nt| { - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - } - }) - } -} - -/// Iterator that generates `RecAlignPos` alignment positions for a BAM -/// record. -/// -/// Note that these positions are generated in reference sequence -/// order. For a reverse-strand alignment, they run from the last to -/// the first sequenced base. -pub struct RecAlignPosIter<'a> { - align_iter: MDPosIter, +pub struct MDAlignPosIter<'a> { + align_iter: CigarMDPosIter, record: &'a bam::record::Record, } -impl<'a> RecAlignPosIter<'a> { +impl<'a> MDAlignPosIter<'a> { /// Create a new iterator for a BAM record. /// /// # Arguments /// /// * `record` is the BAM record whose alignment will be extracted + /// + /// # Errors + /// + /// An error variant is returned when `record` has no MD aux + /// field, when the aux field cannot be parsed, or when there is a + /// conflict between the MD, Cigar, and read sequence. pub fn new(record: &'a bam::record::Record) -> Result { - let align_iter = MDPosIter::new_from_record(record)?; - Ok(RecAlignPosIter { + let align_iter = CigarMDPosIter::new_from_record(record)?; + Ok(MDAlignPosIter { align_iter: align_iter, record: record, }) } - - /// Collects all alignment positions into a _read strand_ vector - /// of alignment positions. This is right to left on the reference - /// sequence, for reverse strand alignments. - #[allow(unused_must_use)] - pub fn read_strand_alignment(&mut self) -> Result>, MDAlignError> { - let mut aln: Result>, MDAlignError> = self.collect(); - - if self.record.is_reverse() { - aln.as_mut().map(|a| a.reverse()); - } - - aln - } } -impl<'a> Iterator for RecAlignPosIter<'a> { - type Item = Result, MDAlignError>; +impl<'a> Iterator for MDAlignPosIter<'a> { + type Item = Result, MDAlignError>; fn next(&mut self) -> Option { self.align_iter .next() - .map(|res| res.map(|mdpos| RecAlignPos::new(self.record, mdpos))) + .map(|res| res.map(|mdpos| MDAlignPos::new(self.record, mdpos))) } } /// Alignment position based on information in the CIGAR and MD aux -/// field for a BAM record. The `MDPos` entries for a BAM record +/// field for a BAM record. The `CigarMDPos` entries for a BAM record /// represent all information from these fields -- together with the /// sequence itself, these can fully reconstruct the read-vs-reference /// alignment. #[derive(Debug, Clone, PartialEq, Eq)] -pub enum MDPos { +pub enum CigarMDPos { Match { read_seq_pos: u32, ref_pos: u32, @@ -464,7 +260,7 @@ pub enum MDPos { } #[allow(unused_variables)] -impl MDPos { +impl CigarMDPos { /// Zero-based offset in the read sequence as reported in the BAM /// alignment. _Note_ that these positions will run from the last /// to the first base in the original input sequence for @@ -473,25 +269,25 @@ impl MDPos { /// `None is returned for read deletions. pub fn read_seq_pos(&self) -> Option { match self { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => Some(*read_seq_pos), - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt, read_seq_pos, ref_pos, } => Some(*read_seq_pos), - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next, } => Some(*read_seq_pos), - MDPos::Delete { + CigarMDPos::Delete { ref_nt, read_seq_pos_next, ref_pos, } => None, - MDPos::SoftClip { read_seq_pos } => Some(*read_seq_pos), + CigarMDPos::SoftClip { read_seq_pos } => Some(*read_seq_pos), } } @@ -502,25 +298,25 @@ impl MDPos { /// original input sequence for reverse-strand alignments. pub fn read_seq_pos_or_next(&self) -> u32 { match self { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => *read_seq_pos, - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt, read_seq_pos, ref_pos, } => *read_seq_pos, - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next, } => *read_seq_pos, - MDPos::Delete { + CigarMDPos::Delete { ref_nt, read_seq_pos_next, ref_pos, } => *read_seq_pos_next, - MDPos::SoftClip { read_seq_pos } => *read_seq_pos, + CigarMDPos::SoftClip { read_seq_pos } => *read_seq_pos, } } @@ -530,25 +326,25 @@ impl MDPos { /// soft clipped positions. pub fn ref_pos(&self) -> Option { match self { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => Some(*ref_pos), - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt, read_seq_pos, ref_pos, } => Some(*ref_pos), - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next, } => None, - MDPos::Delete { + CigarMDPos::Delete { ref_nt, read_seq_pos_next, ref_pos, } => Some(*ref_pos), - MDPos::SoftClip { read_seq_pos } => None, + CigarMDPos::SoftClip { read_seq_pos } => None, } } @@ -559,25 +355,25 @@ impl MDPos { /// `None` is returned for soft clipped positions. pub fn ref_pos_or_next(&self) -> Option { match self { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => Some(*ref_pos), - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt, read_seq_pos, ref_pos, } => Some(*ref_pos), - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next, } => Some(*ref_pos_next), - MDPos::Delete { + CigarMDPos::Delete { ref_nt, read_seq_pos_next, ref_pos, } => Some(*ref_pos), - MDPos::SoftClip { read_seq_pos } => None, + CigarMDPos::SoftClip { read_seq_pos } => None, } } @@ -587,44 +383,44 @@ impl MDPos { /// clipped positions. pub fn ref_nt(&self) -> Option { match self { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => None, - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt, read_seq_pos, ref_pos, } => Some(*ref_nt), - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next, } => None, - MDPos::Delete { + CigarMDPos::Delete { ref_nt, read_seq_pos_next, ref_pos, } => Some(*ref_nt), - MDPos::SoftClip { read_seq_pos } => None, + CigarMDPos::SoftClip { read_seq_pos } => None, } } } -/// Iterator over the `MDPos` positions represented by a BAM +/// Iterator over the `CigarMDPos` positions represented by a BAM /// record. /// /// Note that these positions are generated in reference sequence /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. #[derive(Debug)] -pub struct MDPosIter { +pub struct CigarMDPosIter { md_stack: Vec, cigar_stack: Vec, ref_pos_curr: u32, read_pos_curr: u32, } -impl MDPosIter { +impl CigarMDPosIter { /// Create a new iterator for a BAM record. /// /// # Arguments @@ -637,7 +433,7 @@ impl MDPosIter { let CigarString(mut cigar_stack) = (*record.cigar()).clone(); cigar_stack.reverse(); - Ok(MDPosIter { + Ok(CigarMDPosIter { md_stack: md_stack, cigar_stack: cigar_stack, ref_pos_curr: record.pos() as u32, @@ -645,11 +441,11 @@ impl MDPosIter { }) } - // Utility function that yields the next MDPos. + // Utility function that yields the next CigarMDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries // are cleared from the tops of those respective stacks. - fn next_with_some(&mut self) -> Result { + fn next_with_some(&mut self) -> Result { let pop_cigar; let res = match self.cigar_stack.last_mut().unwrap() { @@ -661,7 +457,7 @@ impl MDPosIter { .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Matches(ref mut mdlen) => { - let pos = MDPos::Match { + let pos = CigarMDPos::Match { read_seq_pos: self.read_pos_curr, ref_pos: self.ref_pos_curr, }; @@ -674,7 +470,7 @@ impl MDPosIter { pos } MatchDesc::Mismatch(ref_nt) => { - let pos = MDPos::Mismatch { + let pos = CigarMDPos::Mismatch { ref_nt: *ref_nt, read_seq_pos: self.read_pos_curr, ref_pos: self.ref_pos_curr, @@ -703,7 +499,7 @@ impl MDPosIter { .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Matches(ref mut mdlen) => { - let pos = MDPos::Match { + let pos = CigarMDPos::Match { read_seq_pos: self.read_pos_curr, ref_pos: self.ref_pos_curr, }; @@ -731,7 +527,7 @@ impl MDPosIter { .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Mismatch(ref_nt) => { - let pos = MDPos::Mismatch { + let pos = CigarMDPos::Mismatch { ref_nt: *ref_nt, read_seq_pos: self.read_pos_curr, ref_pos: self.ref_pos_curr, @@ -751,7 +547,7 @@ impl MDPosIter { mm } Cigar::Ins(ref mut len) => { - let mut pos = MDPos::Insert { + let mut pos = CigarMDPos::Insert { read_seq_pos: self.read_pos_curr, ref_pos_next: self.ref_pos_curr, }; @@ -768,7 +564,7 @@ impl MDPosIter { if ref_nts.len() > 0 { let ref_nt = ref_nts.remove(0); pop_md = ref_nts.is_empty(); - let pos = MDPos::Delete { + let pos = CigarMDPos::Delete { ref_nt: ref_nt, ref_pos: self.ref_pos_curr, read_seq_pos_next: self.read_pos_curr, @@ -789,7 +585,7 @@ impl MDPosIter { del } Cigar::SoftClip(ref mut len) => { - let pos = MDPos::SoftClip { + let pos = CigarMDPos::SoftClip { read_seq_pos: self.read_pos_curr, }; self.read_pos_curr += 1; @@ -813,7 +609,7 @@ impl MDPosIter { /// `Alignment`. This is always a semi-global alignment that /// includes the soft clipping from the read as `Xclip` operations. pub fn collect_into_alignment(self) -> Result { - let mds_result: Result, MDAlignError> = self.collect(); + let mds_result: Result, MDAlignError> = self.collect(); let mds = mds_result?; let mut iter = mds.as_slice().into_iter(); @@ -835,7 +631,7 @@ impl MDPosIter { let (xstart, ystart) = if let Some(md) = curr { (md.read_seq_pos_or_next(), md.ref_pos_or_next().unwrap()) } else { - panic!("No MDPos remaining after left clip") + panic!("No CigarMDPos remaining after left clip") }; let mut xend = xstart; @@ -843,7 +639,7 @@ impl MDPosIter { while let Some(md) = curr { match md { - MDPos::Match { + CigarMDPos::Match { read_seq_pos, ref_pos, } => { @@ -851,7 +647,7 @@ impl MDPosIter { xend = *read_seq_pos; yend = *ref_pos; } - MDPos::Mismatch { + CigarMDPos::Mismatch { ref_nt: _, read_seq_pos, ref_pos, @@ -860,14 +656,14 @@ impl MDPosIter { xend = *read_seq_pos; yend = *ref_pos; } - MDPos::Insert { + CigarMDPos::Insert { read_seq_pos, ref_pos_next: _, } => { ops.push(AlignmentOperation::Ins); xend = *read_seq_pos; } - MDPos::Delete { + CigarMDPos::Delete { ref_nt: _, read_seq_pos_next: _, ref_pos, @@ -875,7 +671,7 @@ impl MDPosIter { ops.push(AlignmentOperation::Del); yend = *ref_pos; } - MDPos::SoftClip { read_seq_pos: _ } => break, + CigarMDPos::SoftClip { read_seq_pos: _ } => break, } curr = iter.next(); } @@ -884,7 +680,7 @@ impl MDPosIter { let mut xlen = xend; while let Some(md) = curr { match md { - MDPos::SoftClip { read_seq_pos } => { + CigarMDPos::SoftClip { read_seq_pos } => { xlen = *read_seq_pos; } _ => { @@ -910,8 +706,8 @@ impl MDPosIter { } } -impl Iterator for MDPosIter { - type Item = Result; +impl Iterator for CigarMDPosIter { + type Item = Result; fn next(&mut self) -> Option { // Process non-yielding cigar entries @@ -1421,19 +1217,19 @@ mod tests { let ap_ref_line: String = aps.iter().map(|ap| ap.ref_line_char()).collect(); assert_eq!(ap_ref_line, TEST_REF_LINE[i]); - let rap_iter = RecAlignPosIter::new(&rec) + let rap_iter = MDAlignPosIter::new(&rec) .ok() - .expect("Creating RecAlignPosIter"); - let rap_res: Result, MDAlignError> = rap_iter.collect(); - let raps = rap_res.ok().expect("Unable to create Vec"); + .expect("Creating MDAlignPosIter"); + let rap_res: Result, MDAlignError> = rap_iter.collect(); + let raps = rap_res.ok().expect("Unable to create Vec"); let rap_read_line: String = raps.iter().map(|rap| rap.read_line_char()).collect(); assert_eq!(rap_read_line, TEST_READ_LINE[i]); let rap_ref_line: String = raps.iter().map(|rap| rap.ref_line_char()).collect(); assert_eq!(rap_ref_line, TEST_REF_LINE[i]); - let md_iter = MDPosIter::new_from_record(&rec) + let md_iter = CigarMDPosIter::new_from_record(&rec) .ok() - .expect("Creating MDPosIter"); + .expect("Creating CigarMDPosIter"); let alignment = md_iter .collect_into_alignment() .ok() From 5ece79421629142c8693625ce3c77ec14a20280b Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Mon, 24 Sep 2018 10:59:35 -0700 Subject: [PATCH 07/16] Reference sequence reconstruction method on Record --- src/bam/md_align.rs | 10 ---------- src/bam/record.rs | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index d2841fedc..c1d1a748e 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1207,16 +1207,6 @@ mod tests { assert_eq!(rec.aux_md().unwrap(), TEST_MD[i]); assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); - let ap_iter = AlignPosIter::new_from_record(&rec) - .ok() - .expect("Creating AlignPosIter"); - let ap_res: Result, MDAlignError> = ap_iter.collect(); - let aps = ap_res.ok().expect("Unable to create Vec"); - let ap_read_line: String = aps.iter().map(|ap| ap.read_line_char()).collect(); - assert_eq!(ap_read_line, TEST_READ_LINE[i]); - let ap_ref_line: String = aps.iter().map(|ap| ap.ref_line_char()).collect(); - assert_eq!(ap_ref_line, TEST_REF_LINE[i]); - let rap_iter = MDAlignPosIter::new(&rec) .ok() .expect("Creating MDAlignPosIter"); diff --git a/src/bam/record.rs b/src/bam/record.rs index 905766cf5..93c5a5de4 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -16,6 +16,7 @@ use itertools::Itertools; use regex::Regex; use bam::{AuxWriteError, HeaderView, ReadError}; +use bam::md_align::{MDAlignError, MDAlignPos, MDAlignPosIter}; use htslib; use utils; @@ -600,6 +601,20 @@ impl Record { _ => None, }) } + + /// Reconstruct reference sequence from Cigar and MD information + /// along with the read sequence. + /// + /// # Errors + /// + /// An error variant is returned when the record has no MD aux + /// field, when the MD aux field is malformed, or when there are + /// inconsistencies between the Cigar string, the MD field, and + /// the read sequence. + pub fn reference_md_seq(&self) -> Result, MDAlignError> { + let md_align: Result, MDAlignError> = MDAlignPosIter::new(&self)?.collect(); + Ok( md_align?.iter().filter_map(|pos| pos.ref_nt()).collect() ) + } } impl Drop for Record { From 1a0ee99968961b75c9e830504d1b88054705ce59 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Thu, 27 Sep 2018 20:03:22 -0700 Subject: [PATCH 08/16] Better MDString newtype using standard traits --- src/bam/md_align.rs | 215 +++++++++++++++++++------------------------- 1 file changed, 90 insertions(+), 125 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index c1d1a748e..951b86a30 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::fmt::Write; use std::str; @@ -427,7 +428,7 @@ impl CigarMDPosIter { /// /// * `record` is the BAM record whose alignment will be extracted pub fn new_from_record(record: &bam::record::Record) -> Result { - let mut md_stack = MatchDesc::parse_from_record(record)?; + let mut md_stack = MDString::new_from_record(record)?.0; md_stack.reverse(); let CigarString(mut cigar_stack) = (*record.cigar()).clone(); @@ -821,8 +822,13 @@ impl MatchDesc { _ => false, } } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MDString(pub Vec); - /// Create a `Vec` of `MatchDesc` entries corresponding to the MD aux field on a record. +impl MDString { + /// Create an `MDString` corresponding to the MD aux field on a record. /// /// # Arguments /// @@ -832,11 +838,11 @@ impl MatchDesc { /// /// If `record` has no MD aux field, or if the value of the aux field is malformed, /// an error variant will be returned. - pub fn parse_from_record(record: &bam::record::Record) -> Result, MDAlignError> { - Self::parse(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?) + pub fn new_from_record(record: &bam::record::Record) -> Result { + Self::new(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?) } - - /// Parses a bytestring as an MD aux field description. + + /// Create an `MDString` by parseing a bytestring as an MD aux field description. /// /// # Arguments /// @@ -849,183 +855,142 @@ impl MatchDesc { /// # Examples /// /// ``` - /// use rust_htslib::bam::md_align::{MatchDesc,MDAlignError}; + /// use rust_htslib::bam::md_align::{MatchDesc,MDString,MDAlignError}; /// # fn try_main() -> Result<(), MDAlignError> { - /// assert_eq!(MatchDesc::parse(b"10A5^AC6")?, - /// [MatchDesc::Matches(10), + /// assert_eq!(MDString::new(b"10A5^AC6")?, + /// MDString(vec![MatchDesc::Matches(10), /// MatchDesc::Mismatch(b'A'), /// MatchDesc::Matches(5), /// MatchDesc::Deletion(b"AC".to_vec()), - /// MatchDesc::Matches(6)]); + /// MatchDesc::Matches(6)])); /// # Ok( () ) /// # } /// # fn main() { try_main().unwrap(); } /// ``` - pub fn parse(mdstring: &[u8]) -> Result, MDAlignError> { - MatchDescIter::new(mdstring).collect() + // This can fail, to it can't be From<&[u8]>. Consider TryFrom<&[u8]> when stabilized + pub fn new(mdstring: &[u8]) -> Result { + let mut mdvec = Vec::new(); + + let mut mdrest = mdstring; + + while !mdrest.is_empty() { + let ch = mdrest.first().unwrap(); + if ch.is_ascii_digit() { + let endpos = mdrest + .iter() + .position(|&c| !c.is_ascii_digit()) + .unwrap_or(mdrest.len()); + let numstr = str::from_utf8(&mdrest[0..endpos])?; + mdrest = &mdrest[endpos..]; + mdvec.push(MatchDesc::Matches(numstr.parse()?)) + } else if *ch == b'^' { + let endpos = mdrest + .iter() + .skip(1) + .position(|&c| !c.is_ascii_uppercase()) + .unwrap_or(mdrest.len() - 1) + 1; + mdvec.push(MatchDesc::Deletion(mdrest[1..endpos].to_vec())); + mdrest = &mdrest[endpos..]; + } else if ch.is_ascii_uppercase() { + mdrest = &mdrest[1..]; + mdvec.push(MatchDesc::Mismatch(*ch)); + } else { + return Err(MDAlignError::BadMD) + } + } + + Ok( MDString(mdvec) ) } +} +impl fmt::Display for MDString { /// Convert a vector of `MatchDesc` entries into an MD aux field /// string. The string will be partly normalized by merging /// adjacent matches (but not deletions), which guarantees - /// unambiguous parsing. + /// unambiguous parsing. Zero-length matches will be added between + /// a mismatch and a subsequent mismatch, deletion, or + /// end-of-string. This ensures that the resulting MD field string + /// matches the regexp + /// + /// `[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*` /// /// # Arguments /// /// * `mds` are a vector of `MatchDesc` entries - /// * `strict_match0` controls whether 0-length matches are added - /// whenever a mismatch or deletion is not followed immediately by a - /// run of matches. Strictly adding a 0-length match between a mismatch - /// and a subsequent mismatch, deletion, or end-of-string ensures - /// that the resulting MD field string matches the regexp - /// - /// `[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*` - /// - /// _Note_ that 0-length matches are always added when a deletion is - /// followed by a mismatch, which is required for unambiguous parsing. /// /// # Examples /// /// ``` - /// use rust_htslib::bam::md_align::{MatchDesc,MDAlignError}; + /// use rust_htslib::bam::md_align::{MatchDesc,MDString,MDAlignError}; /// # fn try_main() -> Result<(), MDAlignError> { /// let mdvec = vec![MatchDesc::Matches(10), /// MatchDesc::Mismatch(b'A'), /// MatchDesc::Matches(5), /// MatchDesc::Deletion(b"AC".to_vec()), /// MatchDesc::Matches(6)]; - /// assert_eq!(MatchDesc::unparse(&mdvec, false), "10A5^AC6"); - /// assert_eq!(MatchDesc::unparse(&mdvec, true), "10A5^AC6"); - /// let unnorm = vec![MatchDesc::Matches(10), - /// MatchDesc::Mismatch(b'A'), - /// MatchDesc::Mismatch(b'G'), - /// MatchDesc::Matches(2), - /// MatchDesc::Matches(2), - /// MatchDesc::Deletion(b"AC".to_vec()), - /// MatchDesc::Mismatch(b'T'), - /// MatchDesc::Matches(5)]; - /// assert_eq!(MatchDesc::unparse(&unnorm, false), "10AG4^AC0T5"); - /// assert_eq!(MatchDesc::unparse(&unnorm, true), "10A0G4^AC0T5"); + /// let mdstr = MDString(mdvec); + /// assert_eq!(mdstr.to_string(), "10A5^AC6"); + /// let unnorm_vec = vec![MatchDesc::Matches(10), + /// MatchDesc::Mismatch(b'A'), + /// MatchDesc::Mismatch(b'G'), + /// MatchDesc::Matches(2), + /// MatchDesc::Matches(2), + /// MatchDesc::Deletion(b"AC".to_vec()), + /// MatchDesc::Mismatch(b'T'), + /// MatchDesc::Matches(5)]; + /// let unnorm_str = MDString(unnorm_vec); + /// assert_eq!(unnorm_str.to_string(), "10A0G4^AC0T5"); /// # Ok( () ) /// # } /// # fn main() { try_main().unwrap(); } /// ``` - pub fn unparse(mds: &[MatchDesc], strict_match0: bool) -> String { - let mut mdstr = String::new(); + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { let mut match_accum = 0; - let mut md_iter = mds.iter().peekable(); + let mut md_iter = self.0.iter().peekable(); while let Some(md) = md_iter.next() { match md { MatchDesc::Matches(mlen) => { if md_iter.peek().map_or(false, |next| next.is_matches()) { match_accum += mlen; } else { - write!(&mut mdstr, "{}", match_accum + mlen).unwrap(); + write!(f, "{}", match_accum + mlen)?; match_accum = 0; } } MatchDesc::Mismatch(refnt) => { - if strict_match0 && md_iter.peek().map_or(true, |next| !next.is_matches()) { - write!(&mut mdstr, "{}0", *refnt as char).unwrap(); + if md_iter.peek().map_or(true, |next| !next.is_matches()) { + write!(f, "{}0", *refnt as char)?; } else { - write!(&mut mdstr, "{}", *refnt as char).unwrap(); + write!(f, "{}", *refnt as char)?; } } MatchDesc::Deletion(refnts) => { - if strict_match0 { - if md_iter.peek().map_or(true, |next| !next.is_matches()) { - write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); - } else { - write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); - } + if md_iter.peek().map_or(true, |next| !next.is_matches()) { + write!(f, "^{}0", str::from_utf8(refnts).unwrap())?; } else { - if md_iter.peek().map_or(false, |next| next.is_mismatch()) { - write!(&mut mdstr, "^{}0", str::from_utf8(refnts).unwrap()).unwrap(); - } else { - write!(&mut mdstr, "^{}", str::from_utf8(refnts).unwrap()).unwrap(); - } + write!(f, "^{}", str::from_utf8(refnts).unwrap())?; } } }; } - mdstr + Ok( () ) } } -/// Iterator over `MatchDesc` entries in an MD aux field -pub struct MatchDescIter<'a> { - md: &'a [u8], -} - -impl<'a> MatchDescIter<'a> { - /// Create a new iterator over an MD aux field string - /// - /// # Arguments - /// - /// * `md` is a bytestring of an MD aux field entry - pub fn new(md: &'a [u8]) -> Self { - MatchDescIter { md: md } - } - - /// Create a new iterator over the MD aux field from a BAM record - /// - /// # Arguments - /// - /// * `record` is the source of the MD aux field for the iterator - /// - /// # Errors - /// - /// If the record does not have a string-valued MD aux field, an - /// error variant is returned. - pub fn new_from_record(record: &'a bam::record::Record) -> Result { - Ok(Self::new( - record.aux_md().ok_or_else(|| MDAlignError::NoMD)?, - )) - } - - // Requires self.md is non-empty, guaranteed to yield a MatchDesc - fn next_nonempty(&mut self) -> Result { - let ch = self.md.first().unwrap(); - if ch.is_ascii_digit() { - let endpos = self - .md - .iter() - .position(|&c| !c.is_ascii_digit()) - .unwrap_or(self.md.len()); - let numstr = str::from_utf8(&self.md[0..endpos])?; - self.md = &self.md[endpos..]; - Ok(MatchDesc::Matches(numstr.parse()?)) - } else if *ch == b'^' { - let endpos = self - .md - .iter() - .skip(1) - .position(|&c| !c.is_ascii_uppercase()) - .unwrap_or(self.md.len() - 1) + 1; - let del = MatchDesc::Deletion(self.md[1..endpos].to_vec()); - self.md = &self.md[endpos..]; - Ok(del) - } else if ch.is_ascii_uppercase() { - self.md = &self.md[1..]; - Ok(MatchDesc::Mismatch(*ch)) - } else { - Err(MDAlignError::BadMD) - } - } +impl <'a> IntoIterator for &'a MDString { + type Item = &'a MatchDesc; + type IntoIter = ::std::slice::Iter<'a, MatchDesc>; + + fn into_iter(self) -> Self::IntoIter { (&self.0).into_iter() } } -impl<'a> Iterator for MatchDescIter<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - if self.md.is_empty() { - None - } else { - Some(self.next_nonempty()) - } - } +impl IntoIterator for MDString { + type Item = MatchDesc; + type IntoIter = ::std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } quick_error! { From 32086f0a889ec76c76a28ee03bc64cc56f3a3cff Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Thu, 27 Sep 2018 20:14:36 -0700 Subject: [PATCH 09/16] Use Iter over MatchDesc rather than reified vector --- src/bam/md_align.rs | 63 ++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 951b86a30..8964eb33b 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1,6 +1,7 @@ use std::fmt; use std::fmt::Write; use std::str; +use std::vec::IntoIter; use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; @@ -415,7 +416,8 @@ impl CigarMDPos { /// the first sequenced base. #[derive(Debug)] pub struct CigarMDPosIter { - md_stack: Vec, + md_iter: IntoIter, + md_curr: Option, cigar_stack: Vec, ref_pos_curr: u32, read_pos_curr: u32, @@ -428,14 +430,15 @@ impl CigarMDPosIter { /// /// * `record` is the BAM record whose alignment will be extracted pub fn new_from_record(record: &bam::record::Record) -> Result { - let mut md_stack = MDString::new_from_record(record)?.0; - md_stack.reverse(); + let mut md_iter = MDString::new_from_record(record)?.into_iter(); + let mut md_curr = md_iter.next(); let CigarString(mut cigar_stack) = (*record.cigar()).clone(); cigar_stack.reverse(); Ok(CigarMDPosIter { - md_stack: md_stack, + md_iter: md_iter, + md_curr: md_curr, cigar_stack: cigar_stack, ref_pos_curr: record.pos() as u32, read_pos_curr: 0, @@ -451,10 +454,10 @@ impl CigarMDPosIter { let res = match self.cigar_stack.last_mut().unwrap() { Cigar::Match(ref mut ciglen) => { - let pop_md; + let next_md; let mmm = match self - .md_stack - .last_mut() + .md_curr + .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Matches(ref mut mdlen) => { @@ -465,7 +468,7 @@ impl CigarMDPosIter { self.read_pos_curr += 1; self.ref_pos_curr += 1; *mdlen -= 1; - pop_md = *mdlen == 0; + next_md = *mdlen == 0; *ciglen -= 1; pop_cigar = *ciglen == 0; pos @@ -478,7 +481,7 @@ impl CigarMDPosIter { }; self.read_pos_curr += 1; self.ref_pos_curr += 1; - pop_md = true; + next_md = true; *ciglen -= 1; pop_cigar = *ciglen == 0; pos @@ -487,16 +490,16 @@ impl CigarMDPosIter { return Err(MDAlignError::MDvsCIGAR); } }; - if pop_md { - self.md_stack.pop(); + if next_md { + self.md_curr = self.md_iter.next(); } mmm } Cigar::Equal(ref mut ciglen) => { - let pop_md; + let next_md; let m = match self - .md_stack - .last_mut() + .md_curr + .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Matches(ref mut mdlen) => { @@ -508,23 +511,23 @@ impl CigarMDPosIter { self.ref_pos_curr += 1; *mdlen -= 1; - pop_md = *mdlen == 0; + next_md = *mdlen == 0; *ciglen -= 1; pop_cigar = *ciglen == 0; pos } _ => return Err(MDAlignError::MDvsCIGAR), }; - if pop_md { - self.md_stack.pop(); + if next_md { + self.md_curr = self.md_iter.next(); } m } Cigar::Diff(ref mut ciglen) => { - let pop_md; + let next_md; let mm = match self - .md_stack - .last_mut() + .md_curr + .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { MatchDesc::Mismatch(ref_nt) => { @@ -535,15 +538,15 @@ impl CigarMDPosIter { }; self.read_pos_curr += 1; self.ref_pos_curr += 1; - pop_md = true; + next_md = true; *ciglen -= 1; pop_cigar = *ciglen == 0; pos } _ => return Err(MDAlignError::MDvsCIGAR), }; - if pop_md { - self.md_stack.pop(); + if next_md { + self.md_curr = self.md_iter.next(); } mm } @@ -558,13 +561,13 @@ impl CigarMDPosIter { pos } Cigar::Del(ref mut len) => { - let pop_md; + let next_md; - let del = match self.md_stack.last_mut() { + let del = match self.md_curr.as_mut() { Some(MatchDesc::Deletion(ref mut ref_nts)) => { if ref_nts.len() > 0 { let ref_nt = ref_nts.remove(0); - pop_md = ref_nts.is_empty(); + next_md = ref_nts.is_empty(); let pos = CigarMDPos::Delete { ref_nt: ref_nt, ref_pos: self.ref_pos_curr, @@ -580,8 +583,8 @@ impl CigarMDPosIter { }; *len -= 1; pop_cigar = *len == 0; - if pop_md { - self.md_stack.pop(); + if next_md { + self.md_curr = self.md_iter.next(); } del } @@ -730,8 +733,8 @@ impl Iterator for CigarMDPosIter { } // Consume 0-length match - while self.md_stack.last() == Some(&MatchDesc::Matches(0)) { - self.md_stack.pop(); + while self.md_curr == Some(MatchDesc::Matches(0)) { + self.md_curr = self.md_iter.next(); } if self.cigar_stack.is_empty() { From 401c58c5ff2f16a3747f7f9c4d70bc824410f17d Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Thu, 27 Sep 2018 20:24:00 -0700 Subject: [PATCH 10/16] Generic over MatchDesc iterator type --- src/bam/md_align.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 8964eb33b..1a6596503 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -196,7 +196,7 @@ impl<'a> MDAlignPos<'a> { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. pub struct MDAlignPosIter<'a> { - align_iter: CigarMDPosIter, + align_iter: CigarMDPosIter>, record: &'a bam::record::Record, } @@ -415,15 +415,15 @@ impl CigarMDPos { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. #[derive(Debug)] -pub struct CigarMDPosIter { - md_iter: IntoIter, +pub struct CigarMDPosIter { + md_iter: I, md_curr: Option, cigar_stack: Vec, ref_pos_curr: u32, read_pos_curr: u32, } -impl CigarMDPosIter { +impl CigarMDPosIter> { /// Create a new iterator for a BAM record. /// /// # Arguments @@ -444,7 +444,9 @@ impl CigarMDPosIter { read_pos_curr: 0, }) } +} +impl > CigarMDPosIter { // Utility function that yields the next CigarMDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries @@ -710,7 +712,7 @@ impl CigarMDPosIter { } } -impl Iterator for CigarMDPosIter { +impl > Iterator for CigarMDPosIter { type Item = Result; fn next(&mut self) -> Option { From c419e6a522b88cf84030fbc3aa930e671d954c21 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Thu, 27 Sep 2018 21:47:46 -0700 Subject: [PATCH 11/16] Moved to iterator access to Cigar information --- src/bam/md_align.rs | 52 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 1a6596503..495433c75 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -196,7 +196,7 @@ impl<'a> MDAlignPos<'a> { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. pub struct MDAlignPosIter<'a> { - align_iter: CigarMDPosIter>, + align_iter: CigarMDPosIter, IntoIter>, record: &'a bam::record::Record, } @@ -415,15 +415,16 @@ impl CigarMDPos { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. #[derive(Debug)] -pub struct CigarMDPosIter { +pub struct CigarMDPosIter { md_iter: I, md_curr: Option, - cigar_stack: Vec, + cigar_iter: J, + cigar_curr: Option, ref_pos_curr: u32, read_pos_curr: u32, } -impl CigarMDPosIter> { +impl CigarMDPosIter, IntoIter> { /// Create a new iterator for a BAM record. /// /// # Arguments @@ -431,30 +432,31 @@ impl CigarMDPosIter> { /// * `record` is the BAM record whose alignment will be extracted pub fn new_from_record(record: &bam::record::Record) -> Result { let mut md_iter = MDString::new_from_record(record)?.into_iter(); - let mut md_curr = md_iter.next(); + let md_curr = md_iter.next(); - let CigarString(mut cigar_stack) = (*record.cigar()).clone(); - cigar_stack.reverse(); + let mut cigar_iter = (*record.cigar()).to_vec().into_iter(); + let cigar_curr = cigar_iter.next(); Ok(CigarMDPosIter { md_iter: md_iter, md_curr: md_curr, - cigar_stack: cigar_stack, + cigar_iter: cigar_iter, + cigar_curr: cigar_curr, ref_pos_curr: record.pos() as u32, read_pos_curr: 0, }) } } -impl > CigarMDPosIter { +impl , J: Iterator> CigarMDPosIter { // Utility function that yields the next CigarMDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries // are cleared from the tops of those respective stacks. fn next_with_some(&mut self) -> Result { - let pop_cigar; + let next_cigar; - let res = match self.cigar_stack.last_mut().unwrap() { + let res = match self.cigar_curr.as_mut().unwrap() { Cigar::Match(ref mut ciglen) => { let next_md; let mmm = match self @@ -472,7 +474,7 @@ impl > CigarMDPosIter { *mdlen -= 1; next_md = *mdlen == 0; *ciglen -= 1; - pop_cigar = *ciglen == 0; + next_cigar = *ciglen == 0; pos } MatchDesc::Mismatch(ref_nt) => { @@ -485,7 +487,7 @@ impl > CigarMDPosIter { self.ref_pos_curr += 1; next_md = true; *ciglen -= 1; - pop_cigar = *ciglen == 0; + next_cigar = *ciglen == 0; pos } _ => { @@ -515,7 +517,7 @@ impl > CigarMDPosIter { *mdlen -= 1; next_md = *mdlen == 0; *ciglen -= 1; - pop_cigar = *ciglen == 0; + next_cigar = *ciglen == 0; pos } _ => return Err(MDAlignError::MDvsCIGAR), @@ -542,7 +544,7 @@ impl > CigarMDPosIter { self.ref_pos_curr += 1; next_md = true; *ciglen -= 1; - pop_cigar = *ciglen == 0; + next_cigar = *ciglen == 0; pos } _ => return Err(MDAlignError::MDvsCIGAR), @@ -559,7 +561,7 @@ impl > CigarMDPosIter { }; self.read_pos_curr += 1; *len -= 1; - pop_cigar = *len == 0; + next_cigar = *len == 0; pos } Cigar::Del(ref mut len) => { @@ -584,7 +586,7 @@ impl > CigarMDPosIter { _ => return Err(MDAlignError::MDvsCIGAR), }; *len -= 1; - pop_cigar = *len == 0; + next_cigar = *len == 0; if next_md { self.md_curr = self.md_iter.next(); } @@ -596,7 +598,7 @@ impl > CigarMDPosIter { }; self.read_pos_curr += 1; *len -= 1; - pop_cigar = *len == 0; + next_cigar = *len == 0; pos } Cigar::RefSkip(_) => panic!("RefSkip in next_with_some"), @@ -604,8 +606,8 @@ impl > CigarMDPosIter { Cigar::Pad(_) => panic!("Pad in next_with_some"), }; - if pop_cigar { - self.cigar_stack.pop(); + if next_cigar { + self.cigar_curr = self.cigar_iter.next(); } Ok(res) @@ -712,15 +714,15 @@ impl > CigarMDPosIter { } } -impl > Iterator for CigarMDPosIter { +impl , J: Iterator> Iterator for CigarMDPosIter { type Item = Result; fn next(&mut self) -> Option { // Process non-yielding cigar entries loop { - let handled = match self.cigar_stack.last() { + let handled = match self.cigar_curr { Some(Cigar::RefSkip(len)) => { - self.ref_pos_curr += *len; + self.ref_pos_curr += len; true } Some(Cigar::HardClip(_len)) => true, @@ -728,7 +730,7 @@ impl > Iterator for CigarMDPosIter { _ => false, }; if handled { - self.cigar_stack.pop(); + self.cigar_curr = self.cigar_iter.next(); } else { break; } @@ -739,7 +741,7 @@ impl > Iterator for CigarMDPosIter { self.md_curr = self.md_iter.next(); } - if self.cigar_stack.is_empty() { + if self.cigar_curr.is_none() { None } else { Some( From 27939c6420c67281ee394db277df43751ad0c189 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Thu, 27 Sep 2018 21:48:09 -0700 Subject: [PATCH 12/16] fmt --- src/bam/md_align.rs | 30 +++++++++++++++++------------- src/bam/record.rs | 4 ++-- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 495433c75..c2ce1e2c4 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -415,7 +415,7 @@ impl CigarMDPos { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. #[derive(Debug)] -pub struct CigarMDPosIter { +pub struct CigarMDPosIter { md_iter: I, md_curr: Option, cigar_iter: J, @@ -448,7 +448,7 @@ impl CigarMDPosIter, IntoIter> { } } -impl , J: Iterator> CigarMDPosIter { +impl, J: Iterator> CigarMDPosIter { // Utility function that yields the next CigarMDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries @@ -714,7 +714,7 @@ impl , J: Iterator> CigarMDPosIter, J: Iterator> Iterator for CigarMDPosIter { +impl, J: Iterator> Iterator for CigarMDPosIter { type Item = Result; fn next(&mut self) -> Option { @@ -848,7 +848,7 @@ impl MDString { pub fn new_from_record(record: &bam::record::Record) -> Result { Self::new(record.aux_md().ok_or_else(|| MDAlignError::NoMD)?) } - + /// Create an `MDString` by parseing a bytestring as an MD aux field description. /// /// # Arguments @@ -877,7 +877,7 @@ impl MDString { // This can fail, to it can't be From<&[u8]>. Consider TryFrom<&[u8]> when stabilized pub fn new(mdstring: &[u8]) -> Result { let mut mdvec = Vec::new(); - + let mut mdrest = mdstring; while !mdrest.is_empty() { @@ -902,11 +902,11 @@ impl MDString { mdrest = &mdrest[1..]; mdvec.push(MatchDesc::Mismatch(*ch)); } else { - return Err(MDAlignError::BadMD) + return Err(MDAlignError::BadMD); } } - Ok( MDString(mdvec) ) + Ok(MDString(mdvec)) } } @@ -982,22 +982,26 @@ impl fmt::Display for MDString { }; } - Ok( () ) + Ok(()) } } -impl <'a> IntoIterator for &'a MDString { +impl<'a> IntoIterator for &'a MDString { type Item = &'a MatchDesc; type IntoIter = ::std::slice::Iter<'a, MatchDesc>; - - fn into_iter(self) -> Self::IntoIter { (&self.0).into_iter() } + + fn into_iter(self) -> Self::IntoIter { + (&self.0).into_iter() + } } impl IntoIterator for MDString { type Item = MatchDesc; type IntoIter = ::std::vec::IntoIter; - - fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } } quick_error! { diff --git a/src/bam/record.rs b/src/bam/record.rs index 93c5a5de4..d2b998544 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -15,8 +15,8 @@ use std::u32; use itertools::Itertools; use regex::Regex; -use bam::{AuxWriteError, HeaderView, ReadError}; use bam::md_align::{MDAlignError, MDAlignPos, MDAlignPosIter}; +use bam::{AuxWriteError, HeaderView, ReadError}; use htslib; use utils; @@ -613,7 +613,7 @@ impl Record { /// the read sequence. pub fn reference_md_seq(&self) -> Result, MDAlignError> { let md_align: Result, MDAlignError> = MDAlignPosIter::new(&self)?.collect(); - Ok( md_align?.iter().filter_map(|pos| pos.ref_nt()).collect() ) + Ok(md_align?.iter().filter_map(|pos| pos.ref_nt()).collect()) } } From c4510e5c79d360822210bb6fdd1d933c5e5c3fcb Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 28 Sep 2018 17:12:54 -0700 Subject: [PATCH 13/16] Moved all read sequence lookup functions onto CigarMDPos and removed all other iterators & position representations. Updated record interface accordingly. --- src/bam/md_align.rs | 421 ++++++++++++++++++-------------------------- src/bam/record.rs | 8 +- 2 files changed, 175 insertions(+), 254 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index c2ce1e2c4..2c215310a 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -6,230 +6,7 @@ use std::vec::IntoIter; use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; use bam; -use bam::record::{Cigar, CigarString}; - -/// Alignment position on a BAM record, reconstructed using read -/// sequence information along with the Cigar string and the MD Aux -/// field. -pub struct MDAlignPos<'a> { - record: &'a bam::record::Record, - pos: CigarMDPos, -} - -impl<'a> MDAlignPos<'a> { - fn new(record: &'a bam::record::Record, pos: CigarMDPos) -> Self { - MDAlignPos { - record: record, - pos: pos, - } - } - - fn read_nt_at(&self, at: u32) -> u8 { - self.record.seq()[at as usize] - } - - fn qual_at(&self, at: u32) -> u8 { - self.record.qual()[at as usize] - } - - /// Read nucleotide sequence as reported in the BAM - /// alignment. _Note_ that this is the complement of the read - /// sequence itself for reverse-strand alignments. - /// - /// `None` is returned for read deletions. - pub fn read_nt(&self) -> Option { - self.pos.read_seq_pos().map(|pos| self.read_nt_at(pos)) - } - - /// Reference nucleotide sequence as reported in the BAM alignment. - /// - /// `None` is returned for read insertions and - /// soft clipped positions. - pub fn ref_nt(&self) -> Option { - self.pos.ref_nt().or_else(|| { - if self.ref_pos().is_some() { - self.read_nt() - } else { - None - } - }) - } - - /// Read nucleotide quality score as reported in the BAM - /// alignment. - /// - /// `None` is returned at read deletion positions. - pub fn read_qual(&self) -> Option { - self.pos.read_seq_pos().map(|pos| self.qual_at(pos)) - } - - /// Zero-based offset in the read sequence as reported in the BAM - /// alignment. _Note_ that these positions will run from the last - /// to the first base in the original input sequence for - /// reverse-strand alignments. - /// - /// `None is returned for read deletions. - pub fn read_seq_pos(&self) -> Option { - self.pos.read_seq_pos() - } - - /// Zero-based offset in the read sequence as reported in the BAM - /// alignment. For read deletions, the offset of the next aligned - /// read sequence nucleotide is reported. _Note_ that these - /// positions will run from the last to the first base in the - /// original input sequence for reverse-strand alignments. - pub fn read_seq_pos_or_next(&self) -> u32 { - self.pos.read_seq_pos_or_next() - } - - /// Zero-based offset in the reference sequence. - /// - /// `None` is returned for read insertions and - /// soft clipped positions. - pub fn ref_pos(&self) -> Option { - self.pos.ref_pos() - } - - /// Zero-based offset in the reference sequence. For read - /// insertions, the offset of the next aligned reference sequence - /// nucleotide is reported. - /// - /// `None` is returned for soft clipped positions. - pub fn ref_pos_or_next(&self) -> Option { - self.pos.ref_pos_or_next() - } - - /// Zero-based offset for this position in the original input read - /// sequence. Position 0 is the first nucleotide from the input - /// read sequence for either forward or reverse strand alignments. - /// - /// `None` is returned for read deletions. - pub fn read_true_pos(&self) -> Option { - self.read_seq_pos().map(|seq_pos| { - if self.record.is_reverse() { - self.record.qual().len() as u32 - (1 + seq_pos) - } else { - seq_pos - } - }) - } - - /// Nucleotide in the original input read sequence, taking into - /// account the fact that reverse-strand alignments report the - /// reverse-complement of the input read sequence. - /// - /// `None` is returned for read deletions. - pub fn read_true_nt(&self) -> Option { - self.read_nt().map(|nt| { - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - } - }) - } - - /// Nucleotide for the reference sequence that is matched against - /// `read_true_nt()`. _Note_ that this is the complement of the - /// nucleotide in the reference sequence (as per `ref_nt`) for - /// reverse-strand alignments. - /// - /// `None` is returned for read insertions and soft-clipped - /// positions. - pub fn ref_true_nt(&self) -> Option { - self.ref_nt().map(|nt| { - if self.record.is_reverse() { - fast_compl(nt) - } else { - nt - } - }) - } - - /// Character for read sequence line in pretty-printed alignment - /// format. This character is the read sequence base, in - /// lower-case for soft-clipped positions, or a `-` for read - /// deletions. - pub fn read_line_char(&self) -> char { - if self.ref_pos_or_next().is_none() { - self.read_nt() - .map_or('-', |nt| nt.to_ascii_lowercase() as char) - } else { - self.read_nt().map_or('-', |nt| nt as char) - } - } - - /// Character for middle match line in pretty-printed alignment - /// format. This is a vertical bar at match positions and a space - /// otherwise. - pub fn match_line_char(&self) -> char { - let read = self.read_nt(); - if read.is_some() && read == self.ref_nt() { - '|' - } else { - ' ' - } - } - - /// Character for reference sequence line in pretty-printed - /// alignment format. This character is the reference sequence - /// base when present, a `-` for read insertions, and a space for - /// soft-clipped positions. - pub fn ref_line_char(&self) -> char { - self.ref_nt().map_or_else( - || { - if self.ref_pos_or_next().is_none() { - ' ' - } else { - '-' - } - }, - |nt| nt as char, - ) - } -} - -/// Iterator that generates `MDAlignPos` alignment positions for a BAM -/// record. -/// -/// Note that these positions are generated in reference sequence -/// order. For a reverse-strand alignment, they run from the last to -/// the first sequenced base. -pub struct MDAlignPosIter<'a> { - align_iter: CigarMDPosIter, IntoIter>, - record: &'a bam::record::Record, -} - -impl<'a> MDAlignPosIter<'a> { - /// Create a new iterator for a BAM record. - /// - /// # Arguments - /// - /// * `record` is the BAM record whose alignment will be extracted - /// - /// # Errors - /// - /// An error variant is returned when `record` has no MD aux - /// field, when the aux field cannot be parsed, or when there is a - /// conflict between the MD, Cigar, and read sequence. - pub fn new(record: &'a bam::record::Record) -> Result { - let align_iter = CigarMDPosIter::new_from_record(record)?; - Ok(MDAlignPosIter { - align_iter: align_iter, - record: record, - }) - } -} - -impl<'a> Iterator for MDAlignPosIter<'a> { - type Item = Result, MDAlignError>; - - fn next(&mut self) -> Option { - self.align_iter - .next() - .map(|res| res.map(|mdpos| MDAlignPos::new(self.record, mdpos))) - } -} +use bam::record::{Cigar}; /// Alignment position based on information in the CIGAR and MD aux /// field for a BAM record. The `CigarMDPos` entries for a BAM record @@ -379,16 +156,45 @@ impl CigarMDPos { } } - /// Reference nucleotide, _when it differs from the read nucleotide_. + /// Read nucleotide sequence as reported in the BAM + /// alignment. _Note_ that this is the complement of the read + /// sequence itself for reverse-strand alignments. + /// + /// `None` is returned for read deletions. + /// + /// # Arguments + /// + /// `record` is the associated BAM alignment + pub fn read_nt(&self, record: &bam::record::Record) -> Option { + self.read_seq_pos().map(|pos| record.seq()[pos as usize]) + } + + /// Read nucleotide quality score as reported in the BAM + /// alignment. /// - /// `None` is returned for matches, read insertions and soft - /// clipped positions. - pub fn ref_nt(&self) -> Option { + /// `None` is returned at read deletion positions. + /// + /// # Arguments + /// + /// `record` is the associated BAM alignment + pub fn read_qual(&self, record: &bam::record::Record) -> Option { + self.read_seq_pos().map(|pos| record.qual()[pos as usize]) + } + + /// Reference nucleotide sequence as reported in the BAM alignment. + /// + /// `None` is returned for read insertions and soft clipped + /// positions. + /// + /// # Arguments + /// + /// `record` is the associated BAM alignment + pub fn ref_nt(&self, record: &bam::record::Record) -> Option { match self { CigarMDPos::Match { read_seq_pos, ref_pos, - } => None, + } => self.read_nt(record), CigarMDPos::Mismatch { ref_nt, read_seq_pos, @@ -406,6 +212,95 @@ impl CigarMDPos { CigarMDPos::SoftClip { read_seq_pos } => None, } } + + /// Zero-based offset for this position in the original input read + /// sequence. Position 0 is the first nucleotide from the input + /// read sequence for either forward or reverse strand alignments. + /// + /// `None` is returned for read deletions. + pub fn read_pos_on_read(&self, record: &bam::record::Record) -> Option { + self.read_seq_pos().map(|seq_pos| { + if record.is_reverse() { + record.qual().len() as u32 - (1 + seq_pos) + } else { + seq_pos + } + }) + } + + /// Nucleotide in the original input read sequence, taking into + /// account the fact that reverse-strand alignments report the + /// reverse-complement of the input read sequence. + /// + /// `None` is returned for read deletions. + pub fn read_nt_on_read(&self, record: &bam::record::Record) -> Option { + self.read_nt(record).map(|nt| { + if record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) + } + + /// Nucleotide for the reference sequence that is matched against + /// `read_onread_nt()`. _Note_ that this is the complement of the + /// nucleotide in the reference sequence (as per `ref_nt`) for + /// reverse-strand alignments. + /// + /// `None` is returned for read insertions and soft-clipped + /// positions. + pub fn ref_nt_on_read(&self, record: &bam::record::Record) -> Option { + self.ref_nt(record).map(|nt| { + if record.is_reverse() { + fast_compl(nt) + } else { + nt + } + }) + } + + /// Character for read sequence line in pretty-printed alignment + /// format. This character is the read sequence base, in + /// lower-case for soft-clipped positions, or a `-` for read + /// deletions. + pub fn read_line_char(&self, record: &bam::record::Record) -> char { + if self.ref_pos_or_next().is_none() { + self.read_nt(record) + .map_or('-', |nt| nt.to_ascii_lowercase() as char) + } else { + self.read_nt(record).map_or('-', |nt| nt as char) + } + } + + /// Character for middle match line in pretty-printed alignment + /// format. This is a vertical bar at match positions and a space + /// otherwise. + pub fn match_line_char(&self, record: &bam::record::Record) -> char { + let read = self.read_nt(record); + if read.is_some() && read == self.ref_nt(record) { + '|' + } else { + ' ' + } + } + + /// Character for reference sequence line in pretty-printed + /// alignment format. This character is the reference sequence + /// base when present, a `-` for read insertions, and a space for + /// soft-clipped positions. + pub fn ref_line_char(&self, record: &bam::record::Record) -> char { + self.ref_nt(record).map_or_else( + || { + if self.ref_pos_or_next().is_none() { + ' ' + } else { + '-' + } + }, + |nt| nt as char, + ) + } } /// Iterator over the `CigarMDPos` positions represented by a BAM @@ -415,7 +310,7 @@ impl CigarMDPos { /// order. For a reverse-strand alignment, they run from the last to /// the first sequenced base. #[derive(Debug)] -pub struct CigarMDPosIter { +pub struct CigarMDIter { md_iter: I, md_curr: Option, cigar_iter: J, @@ -424,31 +319,57 @@ pub struct CigarMDPosIter { read_pos_curr: u32, } -impl CigarMDPosIter, IntoIter> { +impl CigarMDIter, IntoIter> { /// Create a new iterator for a BAM record. /// /// # Arguments /// /// * `record` is the BAM record whose alignment will be extracted + /// + /// # Errors An error variant is returned when `record` has no MD + /// aux field or when there's an error extracting this field. pub fn new_from_record(record: &bam::record::Record) -> Result { - let mut md_iter = MDString::new_from_record(record)?.into_iter(); + Ok( Self::new(MDString::new_from_record(record)?, + (*record.cigar()).to_vec(), + record.pos() as u32) ) + } +} + +impl , J: Iterator> CigarMDIter { + /// Create a new iterator that consumes `MatchDesc` and `Cigar` + /// entries from iterators in order to yield `CigarMDPos` entries + /// describing the alignment. + /// + /// # Arguments + /// + /// `mdstring` can be converted into an iterator over `MatchDesc` + /// entries + /// + /// `cigarstring` can be converted into an iterator over `Cigar` + /// entires + /// + /// `startpos` is the starting position of the alignment on the + /// reference sequence + pub fn new(mdstring: S, cigarstring: T, startpos: u32) -> Self + where S: IntoIterator, + T: IntoIterator + { + let mut md_iter = mdstring.into_iter(); let md_curr = md_iter.next(); - let mut cigar_iter = (*record.cigar()).to_vec().into_iter(); + let mut cigar_iter = cigarstring.into_iter(); let cigar_curr = cigar_iter.next(); - Ok(CigarMDPosIter { + CigarMDIter { md_iter: md_iter, md_curr: md_curr, cigar_iter: cigar_iter, cigar_curr: cigar_curr, - ref_pos_curr: record.pos() as u32, + ref_pos_curr: startpos, read_pos_curr: 0, - }) + } } -} -impl, J: Iterator> CigarMDPosIter { // Utility function that yields the next CigarMDPos. // Requires the cigar stack is non-empty // Requires that 0-length matches and non-yielding cigar entries @@ -714,7 +635,7 @@ impl, J: Iterator> CigarMDPosIter, J: Iterator> Iterator for CigarMDPosIter { +impl, J: Iterator> Iterator for CigarMDIter { type Item = Result; fn next(&mut self) -> Option { @@ -1183,19 +1104,19 @@ mod tests { assert_eq!(rec.aux_md().unwrap(), TEST_MD[i]); assert_eq!(rec.cigar().to_string(), TEST_CIGAR[i]); - let rap_iter = MDAlignPosIter::new(&rec) + let cigar_md_iter = CigarMDIter::new_from_record(&rec) .ok() - .expect("Creating MDAlignPosIter"); - let rap_res: Result, MDAlignError> = rap_iter.collect(); - let raps = rap_res.ok().expect("Unable to create Vec"); - let rap_read_line: String = raps.iter().map(|rap| rap.read_line_char()).collect(); - assert_eq!(rap_read_line, TEST_READ_LINE[i]); - let rap_ref_line: String = raps.iter().map(|rap| rap.ref_line_char()).collect(); - assert_eq!(rap_ref_line, TEST_REF_LINE[i]); - - let md_iter = CigarMDPosIter::new_from_record(&rec) + .expect("Creating CigarMDIter"); + let cigar_md_res: Result, MDAlignError> = cigar_md_iter.collect(); + let cigar_md_vec = cigar_md_res.ok().expect("Unable to create Vec"); + let read_line: String = cigar_md_vec.iter().map(|rap| rap.read_line_char(&rec)).collect(); + assert_eq!(read_line, TEST_READ_LINE[i]); + let ref_line: String = raps.iter().map(|rap| rap.ref_line_char(&rec)).collect(); + assert_eq!(ref_line, TEST_REF_LINE[i]); + + let md_iter = CigarMDIter::new_from_record(&rec) .ok() - .expect("Creating CigarMDPosIter"); + .expect("Creating CigarMDIter"); let alignment = md_iter .collect_into_alignment() .ok() diff --git a/src/bam/record.rs b/src/bam/record.rs index d2b998544..e2724ef88 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -15,7 +15,7 @@ use std::u32; use itertools::Itertools; use regex::Regex; -use bam::md_align::{MDAlignError, MDAlignPos, MDAlignPosIter}; +use bam::md_align::{MDAlignError, CigarMDIter, CigarMDPos}; use bam::{AuxWriteError, HeaderView, ReadError}; use htslib; use utils; @@ -611,9 +611,9 @@ impl Record { /// field, when the MD aux field is malformed, or when there are /// inconsistencies between the Cigar string, the MD field, and /// the read sequence. - pub fn reference_md_seq(&self) -> Result, MDAlignError> { - let md_align: Result, MDAlignError> = MDAlignPosIter::new(&self)?.collect(); - Ok(md_align?.iter().filter_map(|pos| pos.ref_nt()).collect()) + pub fn reference_seq_from_md(&self) -> Result, MDAlignError> { + let cigar_md: Result, MDAlignError> = CigarMDIter::new_from_record(&self)?.collect(); + Ok(cigar_md?.iter().filter_map(|pos| pos.ref_nt(&self)).collect()) } } From a6d8fcf5e6fe0b1c2a684260d82307381f523ac3 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 28 Sep 2018 17:15:22 -0700 Subject: [PATCH 14/16] fmt --- src/bam/md_align.rs | 38 ++++++++++++++++++++------------------ src/bam/record.rs | 17 +++++++++-------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index 2c215310a..ea6e2af22 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -6,7 +6,7 @@ use std::vec::IntoIter; use bio_types::alignment::{Alignment, AlignmentMode, AlignmentOperation}; use bam; -use bam::record::{Cigar}; +use bam::record::Cigar; /// Alignment position based on information in the CIGAR and MD aux /// field for a BAM record. The `CigarMDPos` entries for a BAM record @@ -329,13 +329,15 @@ impl CigarMDIter, IntoIter> { /// # Errors An error variant is returned when `record` has no MD /// aux field or when there's an error extracting this field. pub fn new_from_record(record: &bam::record::Record) -> Result { - Ok( Self::new(MDString::new_from_record(record)?, - (*record.cigar()).to_vec(), - record.pos() as u32) ) + Ok(Self::new( + MDString::new_from_record(record)?, + (*record.cigar()).to_vec(), + record.pos() as u32, + )) } } -impl , J: Iterator> CigarMDIter { +impl, J: Iterator> CigarMDIter { /// Create a new iterator that consumes `MatchDesc` and `Cigar` /// entries from iterators in order to yield `CigarMDPos` entries /// describing the alignment. @@ -350,9 +352,10 @@ impl , J: Iterator> CigarMDIter(mdstring: S, cigarstring: T, startpos: u32) -> Self - where S: IntoIterator, - T: IntoIterator + pub fn new(mdstring: S, cigarstring: T, startpos: u32) -> Self + where + S: IntoIterator, + T: IntoIterator, { let mut md_iter = mdstring.into_iter(); let md_curr = md_iter.next(); @@ -367,7 +370,7 @@ impl , J: Iterator> CigarMDIter, J: Iterator> CigarMDIter { let next_md; - let mmm = match self - .md_curr + let mmm = match self.md_curr .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { @@ -422,8 +424,7 @@ impl , J: Iterator> CigarMDIter { let next_md; - let m = match self - .md_curr + let m = match self.md_curr .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { @@ -450,8 +451,7 @@ impl , J: Iterator> CigarMDIter { let next_md; - let mm = match self - .md_curr + let mm = match self.md_curr .as_mut() .ok_or_else(|| MDAlignError::MDvsCIGAR)? { @@ -547,8 +547,7 @@ impl , J: Iterator> CigarMDIter, MDAlignError> = cigar_md_iter.collect(); let cigar_md_vec = cigar_md_res.ok().expect("Unable to create Vec"); - let read_line: String = cigar_md_vec.iter().map(|rap| rap.read_line_char(&rec)).collect(); + let read_line: String = cigar_md_vec + .iter() + .map(|rap| rap.read_line_char(&rec)) + .collect(); assert_eq!(read_line, TEST_READ_LINE[i]); let ref_line: String = raps.iter().map(|rap| rap.ref_line_char(&rec)).collect(); assert_eq!(ref_line, TEST_REF_LINE[i]); diff --git a/src/bam/record.rs b/src/bam/record.rs index e2724ef88..52a5b6d1b 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -15,7 +15,7 @@ use std::u32; use itertools::Itertools; use regex::Regex; -use bam::md_align::{MDAlignError, CigarMDIter, CigarMDPos}; +use bam::md_align::{CigarMDIter, CigarMDPos, MDAlignError}; use bam::{AuxWriteError, HeaderView, ReadError}; use htslib; use utils; @@ -612,8 +612,12 @@ impl Record { /// inconsistencies between the Cigar string, the MD field, and /// the read sequence. pub fn reference_seq_from_md(&self) -> Result, MDAlignError> { - let cigar_md: Result, MDAlignError> = CigarMDIter::new_from_record(&self)?.collect(); - Ok(cigar_md?.iter().filter_map(|pos| pos.ref_nt(&self)).collect()) + let cigar_md: Result, MDAlignError> = + CigarMDIter::new_from_record(&self)?.collect(); + Ok(cigar_md? + .iter() + .filter_map(|pos| pos.ref_nt(&self)) + .collect()) } } @@ -885,11 +889,8 @@ impl CigarString { /// Create a CigarString from given bytes. pub fn from_bytes(text: &[u8]) -> Result { - Self::from_str( - str::from_utf8(text).map_err(|_| { - CigarError::UnexpectedOperation("unable to parse as UTF8".to_owned()) - })?, - ) + Self::from_str(str::from_utf8(text) + .map_err(|_| CigarError::UnexpectedOperation("unable to parse as UTF8".to_owned()))?) } /// Create a CigarString from given str. From 1d14ba788dbd171f9dd1f0f841ed3d71104cb5bd Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 28 Sep 2018 18:06:31 -0700 Subject: [PATCH 15/16] Collecting into an Alignment now goes through FromIterator --- src/bam/md_align.rs | 148 +++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 63 deletions(-) diff --git a/src/bam/md_align.rs b/src/bam/md_align.rs index ea6e2af22..97175499b 100644 --- a/src/bam/md_align.rs +++ b/src/bam/md_align.rs @@ -1,5 +1,6 @@ use std::fmt; -use std::fmt::Write; +//use std::fmt::Write; +use std::iter::FromIterator; use std::str; use std::vec::IntoIter; @@ -533,94 +534,113 @@ impl, J: Iterator> CigarMDIter Ok(res) } +} +impl FromIterator for Alignment { /// Collect the alignment positions from this record into an /// `Alignment`. This is always a semi-global alignment that /// includes the soft clipping from the read as `Xclip` operations. - pub fn collect_into_alignment(self) -> Result { - let mds_result: Result, MDAlignError> = self.collect(); - let mds = mds_result?; - let mut iter = mds.as_slice().into_iter(); + fn from_iter(into_iterator: T) -> Alignment + where + T: IntoIterator, + { + let mut iter = into_iterator.into_iter(); let mut ops = Vec::new(); let mut curr = iter.next(); let mut left_clip = 0; - while curr.ok_or_else(|| MDAlignError::EmptyAlign)? - .ref_pos_or_next() - .is_none() + while curr.as_ref() + .map_or(false, |cigar_md| cigar_md.ref_pos_or_next().is_none()) { left_clip += 1; curr = iter.next(); } ops.push(AlignmentOperation::Xclip(left_clip)); - let (xstart, ystart) = if let Some(md) = curr { + let (xstart, ystart) = if let Some(md) = curr.as_ref() { (md.read_seq_pos_or_next(), md.ref_pos_or_next().unwrap()) } else { - panic!("No CigarMDPos remaining after left clip") + return Alignment { + score: 0, + xstart: 0, + ystart: 0, + xend: 0, + yend: 0, + ylen: 0, + xlen: 0, + operations: vec![], + mode: AlignmentMode::Semiglobal, + }; }; let mut xend = xstart; let mut yend = ystart; - while let Some(md) = curr { - match md { - CigarMDPos::Match { - read_seq_pos, - ref_pos, - } => { - ops.push(AlignmentOperation::Match); - xend = *read_seq_pos; - yend = *ref_pos; - } - CigarMDPos::Mismatch { - ref_nt: _, - read_seq_pos, - ref_pos, - } => { - ops.push(AlignmentOperation::Subst); - xend = *read_seq_pos; - yend = *ref_pos; - } - CigarMDPos::Insert { - read_seq_pos, - ref_pos_next: _, - } => { - ops.push(AlignmentOperation::Ins); - xend = *read_seq_pos; - } - CigarMDPos::Delete { - ref_nt: _, - read_seq_pos_next: _, - ref_pos, - } => { - ops.push(AlignmentOperation::Del); - yend = *ref_pos; + loop { + if let Some(md) = curr.as_ref() { + match md { + CigarMDPos::Match { + read_seq_pos, + ref_pos, + } => { + ops.push(AlignmentOperation::Match); + xend = *read_seq_pos; + yend = *ref_pos; + } + CigarMDPos::Mismatch { + ref_nt: _, + read_seq_pos, + ref_pos, + } => { + ops.push(AlignmentOperation::Subst); + xend = *read_seq_pos; + yend = *ref_pos; + } + CigarMDPos::Insert { + read_seq_pos, + ref_pos_next: _, + } => { + ops.push(AlignmentOperation::Ins); + xend = *read_seq_pos; + } + CigarMDPos::Delete { + ref_nt: _, + read_seq_pos_next: _, + ref_pos, + } => { + ops.push(AlignmentOperation::Del); + yend = *ref_pos; + } + CigarMDPos::SoftClip { read_seq_pos: _ } => break, } - CigarMDPos::SoftClip { read_seq_pos: _ } => break, + } else { + break; } + curr = iter.next(); } let mut right_clip = 0; let mut xlen = xend; - while let Some(md) = curr { - match md { - CigarMDPos::SoftClip { read_seq_pos } => { - xlen = *read_seq_pos; - } - _ => { - return Err(MDAlignError::BadMD); - } - }; - right_clip += 1; + loop { + if let Some(md) = curr.as_ref() { + match md { + CigarMDPos::SoftClip { read_seq_pos } => { + xlen = *read_seq_pos; + } + _ => break, + }; + right_clip += 1; + } else { + break; + } curr = iter.next(); } ops.push(AlignmentOperation::Xclip(right_clip)); - Ok(Alignment { + Alignment { score: 0, xstart: xstart as usize, ystart: ystart as usize, @@ -630,7 +650,7 @@ impl, J: Iterator> CigarMDIter xlen: xlen as usize, operations: ops, mode: AlignmentMode::Semiglobal, - }) + } } } @@ -1110,19 +1130,21 @@ mod tests { let cigar_md_vec = cigar_md_res.ok().expect("Unable to create Vec"); let read_line: String = cigar_md_vec .iter() - .map(|rap| rap.read_line_char(&rec)) + .map(|cigar_md| cigar_md.read_line_char(&rec)) .collect(); assert_eq!(read_line, TEST_READ_LINE[i]); - let ref_line: String = raps.iter().map(|rap| rap.ref_line_char(&rec)).collect(); + let ref_line: String = cigar_md_vec + .iter() + .map(|cigar_md| cigar_md.ref_line_char(&rec)) + .collect(); assert_eq!(ref_line, TEST_REF_LINE[i]); - let md_iter = CigarMDIter::new_from_record(&rec) + let cigar_md_iter = CigarMDIter::new_from_record(&rec) .ok() .expect("Creating CigarMDIter"); - let alignment = md_iter - .collect_into_alignment() - .ok() - .expect("collecting into alignment"); + let cigar_md_res: Result, MDAlignError> = cigar_md_iter.collect(); + let cigar_md_vec = cigar_md_res.ok().expect("Unable to create Vec"); + let alignment = cigar_md_vec.into_iter().collect::(); let a_cigar = alignment.cigar(false); assert_eq!(a_cigar, TEST_ALIGN_CIGAR[i]); } From 8a5e3acb42a9759a5725e36569a337289e7e95a4 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 28 Sep 2018 18:18:02 -0700 Subject: [PATCH 16/16] Alignment reconstruction on the Record --- src/bam/record.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/bam/record.rs b/src/bam/record.rs index 52a5b6d1b..9bdcd47fc 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -619,6 +619,19 @@ impl Record { .filter_map(|pos| pos.ref_nt(&self)) .collect()) } + + /// Reconstruct alignment from Cigar and MD information on a record. + /// + /// # Errors + /// + /// An error variant is returned when the record has no MD aux + /// field, when the MD aux field is malformed, or when there are + /// inconsistencies between the Cigar string and the MD field. + pub fn alignment_from_md(&self) -> Result { + let cigar_md: Result, MDAlignError> = + CigarMDIter::new_from_record(&self)?.collect(); + Ok(cigar_md?.into_iter().collect::()) + } } impl Drop for Record {