from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
"""Module utilities and constants used throughout bamnostic
Copyright (c) 2018, Marcus D. Sherman
This code is part of the bamnostic distribution and governed by its
license. Please see the LICENSE file that should have been included
as part of this package.
Some methods are modified versions of their counterparts
within the BioPython.bgzf module. Below is the Copyright and licensing
for those parts.
Copyright (c) 2010-2015 by Peter Cock.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: "Marcus D. Sherman"
@copyright: "Copyright 2018, University of Michigan, Mills Lab
@email: "mdsherman<at>betteridiot<dot>tech"
"""
import struct
from collections import OrderedDict, namedtuple
import collections
# Python 2 doesn't put abstract base classes in the same spot as Python 3
import sys
_PY_VERSION = sys.version_info
_is_pypy = '__pypy__' in sys.builtin_module_names
if _is_pypy:
import __pypy__
if _PY_VERSION[0] == 2:
from collections import Sequence
from collections import OrderedDict as dict
else:
from collections.abc import Sequence
import numbers
import warnings
import re
warnings.formatwarning = format_warnings
# pre-compiled structures to reduce iterative unpacking
unpack_int32 = struct.Struct('<i').unpack
unpack_int32L = struct.Struct('<l').unpack
# Helper class for performant named indexing of region of interests
[docs]class Roi(object):
r"""Small __slots__ class for region of interest parsing"""
__slots__ = ['__contig', 'start', 'stop', '__tid']
def __init__(self, contig=None, start=None, stop=None, tid=None):
r""" Initialize the class
Args:
contig (str): string representation of chromosome/contig of interest
start (int): starting base position of region of interest
stop (int): ending base position of region of interest
tid (int): position of reference within the BAM header
"""
self.__contig, self.start, self.stop, self.__tid = contig, start, stop, tid
@property
def contig(self):
return self.__contig
@contig.setter
def contig(self, ref_name):
self.__contig = ref_name
@property
def tid(self):
return self.__tid
@tid.setter
def tid(self, value):
self.__tid = value
def __repr__(self):
if self.tid is not None and not self.contig:
return 'Roi(tid: {}, start: {}, stop: {})'.format(self.tid, self.start, self.stop)
elif self.contig and self.tid is None:
return 'Roi(contig: {}, start: {}, stop: {})'.format(self.contig, self.start, self.stop)
else:
return 'Roi(tid: {}, contig: {}, start: {}, stop: {})'.format(self.tid, self.contig, self.start, self.stop)
def __str__(self):
return self.__repr__()
[docs]def flag_decode(flag_code):
r"""Simple read alignment flag decoder
Every read within a BAM file ought to have an associated flag code. Theses
flags are used for read filtering and QC. The flags are described below.
Additionally, they can be found `here <https://samtools.github.io/hts-specs/SAMv1.pdf>`_
Any given read's flag is determined by the *or* (`|`) operand of all appropriate bit flags.
Args:
flag_code (int): either a standalone integer/bit flag or the read object itself
Returns:
(:obj:`list` of :obj:`tuple`): list of flag and flag description tuples.
Raises:
ValueError: if provided flag is not a valid entry
Example:
If a flag is 516 it is comprised of flag 4 and flag 512
>>> flag_decode(516)
[(4, 'read unmapped'), (512, 'QC fail')]
==== ===== ==================================================================
Int Bit Description
==== ===== ==================================================================
1 0x1 Template having multiple segments in sequencing
2 0x2 Each segment properly aligned according to the aligner
4 0x4 Segment unmapped
8 0x8 Next segment in the template unmapped
16 0x10 SEQ being reverse complemented
32 0x20 SEQ of the next segment in the template being reverse complemented
64 0x40 The first segment in the template
128 0x80 The last segment in the template
256 0x100 Secondary alignment
512 0x200 Not passing filters, such as platform/vendor quality controls
1024 0x400 PCR or optical duplicate
2048 0x800 Supplementary alignment
==== ===== ==================================================================
"""
flags = {0x1: 'read paired', 0x2: 'read mapped in proper pair',
0x4: 'read unmapped', 0x8: 'mate unmapped',
0x10: 'read reverse strand', 0x20: 'mate reverse strand',
0x40: 'first in pair', 0x80: 'second in pair',
0x100: 'secondary alignment', 0x200: 'QC fail',
0x400: 'PCR or optical duplicate', 0x800: 'supplementary alignment'}
if isinstance(flag_code, numbers.Integral):
code = flag_code
else:
code = flag_code.flag
if not isinstance(code, numbers.Integral):
raise ValueError('Provided flag is not a valid entry')
return sorted([(key, flags[key]) for key in flags if key & code])
[docs]def yes_no(message):
""" Simple prompt parser"""
yes = set(['yes', 'ye', 'y', ''])
no = set(['no', 'n'])
print(message)
while True:
answer = input('Would you like to continue? [y/n] ').lower()
if answer in yes:
return True
elif answer in no:
return False
else:
print('Please answer "Yes" or "No"')
def filter_read(read, read_callback='all'):
# Accept all reads
if read_callback == 'nofilter':
return True
# check the read flags against filter criteria
elif read_callback == 'all':
return not read.flag & 0x704 # hex for filter criteria flag bits
# custom filter
elif callable(read_callback):
return read_callback(read)
else:
raise RuntimeError('read_callback should be "all", "nofilter", or a custom function that returns a boolean')
def _parse_sam_region(region):
""" Splits and casts SAM-formatted regions"""
sam_region = ':'.join(region.split()).replace('-', ':').split(':')
# convert start and stop to integers
for i, arg in enumerate(sam_region[1:]):
sam_region[i + 1] = int(arg)
return sam_region
def _handle_split_region(split_roi, until_eof=False):
""" Checks format against `until_eof` and creates the Roi object
Args:
split_roi (:py:obj:`list` or :py::obj:`tuple`): the contig, start, and stop information.
until_eof (bool): whether or not to allow access to end of reference or file (whichever is first)
Returns:
(:py:obj:`bamnostic.utils.Roi`): region of interest formatted as an object with named attributes.
Raises:
ValueError: if `until_eof` is not set and region is open-ended or improper region format.
"""
split_roi = list(split_roi)
# make sure the user didn't put multiple positional arguments
if 1 <= len(split_roi) <= 3:
# if the user gives an integer description of chromosome, convert to string
if isinstance(split_roi[0], (str, int)):
split_roi[0] = str(split_roi[0])
if None in split_roi[1:]:
# make sure the user wants to continue if they have used an open-ended region
if not until_eof:
raise ValueError('Open-ended region while `until_eof` is set to False')
return Roi(*split_roi)
else:
raise ValueError('improper region format')
[docs]def parse_region(contig=None, start=None, stop=None, region=None,
tid=None, reference=None, end=None, until_eof=False):
r""" Parses region information from all user set parameters.
The main goal of this function is to handle the many different ways a user
can put in genomic region data. One way is through keyword arguments. This
is the most straight forward. However, due to Pysam's API, there are multiple
synonyms for reference/contig or stop/end. Additionally, if the user knows
the refID/TID of the reference they are interested in, they can input that way
as well.
The second form a submission can take is through positional arguments. Just like
keyword, but ordered such that it makes up a genomic region of interest.
Note:
Positional arguments make utilizing the `tid` parameter difficult since it is
the 5th argument of the function signature.
The third form a submission can take is through using a SAM-formatted string. An
example SAM-formatted string looks like 'chr1:10-100'. As many users also copy &
paste directly from tab-delimited files, such as BED files, a SAM-formatted string
can take the form of 'chr1\\t10\\t100' where '\\t' indicates a tab space.
Lastly, the `until_eof` switch allows users to take all items from their desired
start position (be it the whole reference or a specific spot on the reference). Setting
this to `True` (default: `False`) will pull all reads to the end of the reference or file,
whichever is first.
Args:
contig (str): name of reference/contig
start (int): start position of region of interest (0-based)
stop (int): stop position of region of interest (0-based)
region (str): SAM region formatted string. Accepts tab-delimited values as well
tid (int): the refID or target id of a reference/contig
until_eof (bool): iterate until end of file (default: False)
Returns:
query (:py:class:`bamnostic.utils.Roi`): region of interest formatted as an object with named attributes.
Raises:
ValueError: if two synonym keywords are set, but contradict each other
Examples:
.. code-block:: python
# Keyword-based
>>> parse_region(contig = 'chr1', start = 10, stop = 100)
Roi(contig: chr1, start: 10, stop: 100)
# Using `tid` instead of `contig`
>>> parse_region(tid = 0, start = 10, stop = 100)
Roi(tid: 0, start: 10, stop: 100)
# Positional arguments
>>> parse_region('chr1', 10, 100)
Roi(contig: chr1, start: 10, stop: 100)
# SAM-formatted string (keyword)
>>> parse_region(region = 'chr1:10-100')
Roi(contig: chr1, start: 10, stop: 100)
# SAM-formatted string (positional)
>>> parse_region('chr1:10-100')
Roi(contig: chr1, start: 10, stop: 100)
# Tab-delimited region string
>>> parse_region('chr1\t10\t100')
Roi(contig: chr1, start: 10, stop: 100)
# Contradictory synonyms
>>> parse_region(reference='chr1', contig='chr10', start=10, stop = 100)
Traceback (most recent call last):
...
ValueError: either contig or reference must be set, not both
"""
# Check synonyms for the reference sequence
if contig and reference:
if contig != reference:
raise ValueError('either contig or reference must be set, not both')
else:
contig = contig
elif contig or reference:
contig = contig if contig else reference
# make sure the same thing isn't computed twice
if type(contig) is Roi: # class defined in bamnostic.utils
query = contig
# check for SAM-formatted regions or bed file format
elif region or (contig is not None and (':' in contig or '\t' in contig)):
roi = region if region else contig
query = _handle_split_region(_parse_sam_region(roi), until_eof=until_eof)
else:
if tid and not contig:
contig = None
if (stop and end) and (stop != end):
raise ValueError('either stop or end must be set, not both')
else:
stop = stop if stop else end
query = _handle_split_region((contig, start, stop), until_eof=until_eof)
query.tid = tid
return query
[docs]def unpack(fmt, _io, is_array=False):
"""Utility function for unpacking binary data from file object or byte
stream.
The only difference between this method and `struct.unpack` is that
`unpack` dynamically determines the size needed to read in based on
the format string. Additionally, it can process a file object or byte
stream and implement a read or slice (respectively). Mainly, this is a
quality of life function.
Args:
fmt (str): the string format of the binary data to be unpacked
_io: built-in binary format reader (default: io.BufferedRandom)
Returns:
unpacked contents from _io based on fmt string
"""
size = struct.calcsize(fmt)
try:
# if it is byte object
out = struct.unpack(fmt, _io)
except:
# if it is a file object
out = struct.unpack(fmt, _io.read(size))
if len(out) > 1 or is_array:
return out
else:
return out[0]
[docs]def make_virtual_offset(block_start_offset, within_block_offset):
"""Compute a BGZF virtual offset from block start and within block offsets.
The BAM indexing scheme records read positions using a 64 bit
'virtual offset', comprising in C terms:
block_start_offset << 16 | within_block_offset
Here block_start_offset is the file offset of the BGZF block
start (unsigned integer using up to 64-16 = 48 bits), and
within_block_offset within the (decompressed) block (unsigned
16 bit integer).
>>> make_virtual_offset(0, 0)
0
>>> make_virtual_offset(0, 1)
1
>>> make_virtual_offset(0, 2**16 - 1)
65535
>>> make_virtual_offset(0, 2**16)
Traceback (most recent call last):
...
ValueError: Require 0 <= within_block_offset < 2**16, got 65536
"""
if within_block_offset < 0 or within_block_offset >= 65536:
raise ValueError("Require 0 <= within_block_offset < 2**16, got %i" %
within_block_offset)
if block_start_offset < 0 or block_start_offset >= 281474976710656:
raise ValueError("Require 0 <= block_start_offset < 2**48, got %i" %
block_start_offset)
return (block_start_offset << 16) | within_block_offset
[docs]def split_virtual_offset(virtual_offset):
"""Divides a 64-bit BGZF virtual offset into block start & within block offsets.
>>> (100000, 0) == split_virtual_offset(6553600000)
True
>>> (100000, 10) == split_virtual_offset(6553600010)
True
"""
coffset = virtual_offset >> 16
uoffset = virtual_offset ^ (coffset << 16)
return coffset, uoffset
[docs]class LruDict(OrderedDict):
"""Simple least recently used (LRU) based dictionary that caches a given
number of items.
"""
def __init__(self, *args, **kwargs):
""" Initialize the dictionary based on collections.OrderedDict. This
is built of the basic `OrderedDict`. The major difference in instantiation
is the usage of the `max_cache` argument. This sets the dictionary size
to be used.
Args:
items (iterable): an iterable object of key/value pairs
max_cache (int): integer divisible by 2 to set max size of dictionary
"""
try:
self.max_cache = kwargs.pop('max_cache', 128)
mode = kwargs.pop('mode', 'fifo')
if mode == 'fifo':
self.mode = False
else:
self.mode = True
except AttributeError:
self.max_cache = 128
self.mode = False # FIFO
super(LruDict, self).__init__(*args, **kwargs)
self.__map = {}
if _is_pypy:
self.move_to_end = self._pypy_move_to_end
elif _PY_VERSION[:2] <= (3,2):
self.move_to_end = self._py27_move_to_end
else:
self.move_to_end = super(LruDict, self).move_to_end
self.cull()
[docs] def get(self, key):
""" Basic getter that renews LRU status upon inspection
Args:
key (str): immutable dictionary key
"""
value = OrderedDict.__getitem__(self, key)
self.move_to_end(key)
return value
def _pypy_move_to_end(self, key, last=True):
__pypy__.move_to_end(self, key, last)
def _py27_move_to_end(self, key, last=True):
"""Move an existing element to the end (or beginning if last is false).
Raise KeyError if the element does not exist.
"""
link = self._OrderedDict__map[key]
link_prev, link_next, _ = link
soft_link = link_next[0]
link_prev[1] = link_next
link_next[0] = link_prev
root = self._OrderedDict__root
if last:
last = root[0]
link[0] = last
link[1] = root
root[0] = soft_link
last[1] = link
else:
first = root[1]
link[0] = root
link[1] = first
first[0] = soft_link
root[1] = link
[docs] def update(self, others):
""" Same as a regular `dict.update`, however, since pypy's `dict.update`
doesn't go through `dict.__setitem__`, this is used to ensure it does
Args:
others (iterable): a dictionary or iterable containing key/value pairs
"""
if type(others) == dict:
for k,v in others.items():
self.__setitem__(k,v)
elif type(others) in [list, tuple]:
for k,v in others:
self.__setitem__(k,v)
else:
raise ValueError('Iteratable/dict must be in key, value pairs')
[docs] def cull(self):
""" Main utility function for pruning the LruDict
If the length of the LruDict is more than `max_cache`, it removes the LRU item
"""
if self.max_cache:
overflow = max(0, len(self) - self.max_cache)
if overflow > 0:
for _ in range(overflow):
self.popitem(last=self.mode)
def __setitem__(self, key, value):
"""Basic setter that adds new item to dictionary, and then performs cull()
to ensure max_cache has not been violated.
Args:
key (str): immutable dictionary key
value (any): any dictionary value
"""
OrderedDict.__setitem__(self, key, value)
self.cull()
# The BAM format uses byte encoding to compress alignment data. One such
# compression is how operations are stored: they are stored and an
# array of integers. These integers are mapped to their respective
# operation identifier. Below is the mapping utility.
_CIGAR_OPS = {'M': ('BAM_CMATCH', 0),
'I': ('BAM_CINS', 1),
'D': ('BAM_CDEL', 2),
'N': ('BAM_CREF_SKIP', 3),
'S': ('BAM_CSOFT_CLIP', 4),
'H': ('BAM_CHARD_CLIP', 5),
'P': ('BAM_CPAD', 6),
'=': ('BAM_CEQUAL', 7),
'X': ('BAM_CDIFF', 8),
'B': ('BAM_CBACK', 9)}
[docs]def parse_cigar(cigar_str):
"""Parses a CIGAR string and turns it into a list of tuples
Args:
cigar_str (str): the CIGAR string as shown in SAM entry
Returns:
cigar_array (list): list of tuples of CIGAR operations (by id) and number of operations
Raises:
ValueError: if CIGAR operation is invalid
Examples:
>>> parse_cigar('3M1I3M1D5M') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
[(('BAM_CMATCH', 0), 3), ..., (('BAM_CMATCH', 0), 5)]
"""
cigar_array = []
for cigar_op in re.finditer(r'(?P<n_op>\d+)(?P<op>\w)', cigar_str):
op_dict = cigar_op.groupdict()
n_ops = int(op_dict['n_op'])
op = _CIGAR_OPS.get(op_dict['op'], -1)
if op == -1:
raise ValueError('Invalid CIGAR operation ({}).'.format(op_dict['op']))
cigar_array.append((op, n_ops))
return cigar_array
[docs]def check_cigar_arg(cigar):
""" Checks to make sure CIGAR arugment is valid.
Args:
argument (str or :py:obj:`list`): CIGAR string (pre-formatted or raw)
Returns:
(:py:obj:`list`): CIGAR re-formatted as a list
Raises:
ValueError: if CIGAR is not a string or pre-formatted list
"""
if type(cigar) == str:
cigar = parse_cigar(cigar)
elif type(cigar) == list:
pass
else:
raise ValueError('CIGAR must be string or list of tuples of cigar operations (by ID) and number of operations')
return cigar
[docs]def cigar_changes(seq, cigar):
"""Recreates the reference sequence to the extent that the CIGAR string can
represent.
Args:
seq (str): aligned segment sequence
cigar (list): list of tuples of cigar operations (by id) and number of operations
Returns:
cigar_formatted_ref (str): a version of the aligned segment's reference \
sequence given the changes reflected in the cigar string
Raises:
ValueError: if CIGAR operation is invalid
Examples:
>>> cigar_changes('ACTAGAATGGCT', '3M1I3M1D5M')
'ACTGAATGGCT'
>>> cigar_changes('ACTAGAATGGCT', '3V1I3M1D5M')
Traceback (most recent call last):
...
ValueError: Invalid CIGAR operation (V).
"""
cigar = check_cigar_arg(cigar)
cigar_formatted_ref = ''
last_cigar_pos = 0
for op, n_ops in cigar:
op_id = op[1]
if op_id in {0, 7, 8}: # matches (uses both sequence match & mismatch)
cigar_formatted_ref += seq[last_cigar_pos:last_cigar_pos + n_ops]
last_cigar_pos += n_ops
elif op_id in {1, 4}: # insertion or clips
last_cigar_pos += n_ops
elif op_id == 3: # intron or large gaps
cigar_formatted_ref += 'N' * n_ops
elif op_id in {2, 5}:
pass
else:
raise ValueError('Invalid CIGAR string: {}'.format(op))
return cigar_formatted_ref
[docs]def md_changes(seq, md_tag):
"""Recreates the reference sequence of a given alignment to the extent that the
MD tag can represent.
Note:
Used in conjunction with `cigar_changes` to recreate the
complete reference sequence
Args:
seq (str): aligned segment sequence
md_tag (str): MD tag for associated sequence
Returns:
ref_seq (str): a version of the aligned segment's reference sequence given \
the changes reflected in the MD tag
Raises:
ValueError: if MD tag is None
Example:
>>> md_changes('CTTATATTGGCCTT', '3C4AT4')
'CTTCTATTATCCTT'
"""
if md_tag is None:
raise ValueError('No MD tag found or given for sequence')
ref_seq = ''
last_md_pos = 0
for mo in re.finditer(r'(?P<matches>\d+)|(?P<del>\^\w+?(?=\d))|(?P<sub>\w)', md_tag):
mo_group_dict = mo.groupdict()
if mo_group_dict['matches'] is not None:
matches = int(mo_group_dict['matches'])
ref_seq += seq[last_md_pos:last_md_pos + matches]
last_md_pos += matches
elif mo_group_dict['del'] is not None:
deletion = mo_group_dict['del']
ref_seq += deletion[1:]
elif mo_group_dict['sub'] is not None:
substitution = mo_group_dict['sub']
ref_seq += substitution
last_md_pos += 1
else:
pass
return ref_seq
[docs]def ref_gen(seq, cigar_string, md_tag):
"""Recreates the reference sequence associated with the given segment.
Uses the CIGAR string and MD tag to recreate the reference sequence associated
with the aligned segment. This is done without the need for looking up
the reference genome. Example reads, MD tags, and CIGAR strings taken from
`David Tang's Blog`_.
.. _David Tang's Blog: https://davetang.org/muse/2011/01/28/perl-and-sam/
Returns:
(str): generated reference sequence
Raises:
KeyError: if read does not contain MD tag
Examples:
# Only mismatches
>>> seq = 'CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG'
>>> cigar = '36M'
>>> md = '1A0C0C0C1T0C0T27'
>>> ref_gen(seq, cigar, md)
'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG'
# Insertions and mismatches
>>> seq = 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'
>>> cigar = '6M1I29M'
>>> md = '0C1C0C1C0T0C27'
>>> ref_gen(seq, cigar, md)
'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACAT'
# Deletion and mismatches
>>> seq = 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'
>>> cigar = '9M9D27M'
>>> md = '2G0A5^ATGATGTCA27'
>>> ref_gen(seq, cigar, md)
'AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC'
# Insertion, deletion, and mismatches
>>> seq = 'AGTGATGGGAGGATGTCTCGTCTGTGAGTTACAGCA'
>>> cigar = '2M1I7M6D26M'
>>> md = '3C3T1^GCTCAG26'
>>> ref_gen(seq, cigar, md)
'AGGCTGGTAGCTCAGGGATGTCTCGTCTGTGAGTTACAGCA'
"""
return md_changes(cigar_changes(seq, cigar_string), md_tag)
[docs]def cigar_alignment(seq=None, cigar=None, start_pos = 0, qualities=None, base_qual_thresh=0, query=False):
"""Use the CIGAR to filter out all unaligned data bases
Any clipping results in the removal of those bases. If an insertion is seen in
the CIGAR, those bases are removed from the sequence. If a deletion is seen in
the CIGAR, those bases are padded with a period ('.') symbol.
Args:
seq (str): string sequence of the aligned segment.
cigar (str): the cigar string or `cigartuple` of the aligned segment.
start_pos (int): the first aligned position of the read
qualities (:py:obj:`array.array`): base quality array from read
Yields:
(:py:obj:`tuple` of :py:obj:`str` and :py:obj:`int`): nucleotide base and index position of that base relative to reference
Example:
>>> seq = 'AGTGATGGGAGGATGTCTCGTCTGTGAGTTACAGCA'
>>> cigar = '2M1I7M6D26M'
>>> start_position = 95
>>> c_a = cigar_alignment(seq, cigar, start_position)
>>> next(c_a)
('A', 95)
"""
cigar = check_cigar_arg(cigar)
cigar_aligned = ''
algn_seg = {}
last_cigar_pos = 0
for op, n_ops in cigar:
op_id = op if type(op) is int else op[1]
if op_id == 5: # BAM_CHARD_CLIP: skip hard clip CIGAR ops
pass
elif op_id in {1, 4}: # BAM_CINS or BAM_CSOFT_CLIP: remove from sequence
if query and op_id == 1:
seg_seq = seq[last_cigar_pos:last_cigar_pos + n_ops]
if qualities is not None:
seg_qual = qualities[last_cigar_pos:last_cigar_pos + n_ops]
for index, base in enumerate(seg_seq):
if qualities is not None:
if seg_qual[index] >= base_qual_thresh:
yield base, start_pos
else:
yield base, start_pos
last_cigar_pos += n_ops
elif op_id == 3: # BAM_CREF_SKIP: intron or large gaps
start_pos += n_ops
elif op_id == 2: # BAM_CDEL: pad for deletions
start_pos += n_ops
elif op_id in {0, 7, 8}: # matches (uses both sequence match & mismatch)
seg_seq = seq[last_cigar_pos:last_cigar_pos + n_ops]
if qualities is not None:
seg_qual = qualities[last_cigar_pos:last_cigar_pos + n_ops]
for index, base in enumerate(seg_seq):
if qualities is not None:
if seg_qual[index] >= base_qual_thresh:
yield base, start_pos
else:
yield base, start_pos
start_pos += 1
last_cigar_pos += n_ops
else:
raise ValueError('Invalid CIGAR string: {}'.format(op))