Source code for samsifter.tools.filter_ref_coverage

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Identify reference accessions with uneven coverage in MALT'ed SAM files.

Comes with several methods to create optional plots of coverage and read length
distributions.

Warning
-------
Activating the plotting of these distributions for a large input dataset can
cause I/O problems due to the large number of PNG files generated. It will
also decrease the performance of this filter considerably and should only be
used to troubleshoot filter parameters for small subsets of the data.

.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""

import sys
if sys.version_info[0] < 3:
    print("Error: this tool requires Python 3.x or newer")
    exit(1)

import argparse
import logging as log
import fileinput
import tempfile
from os.path import basename, splitext
import re
import matplotlib as mpl
mpl.use('Agg')       # before importing numpy, else trouble on headless server
import matplotlib.pyplot as plt
import numpy as np

# custom libraries
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import (FilterParameter, FilterSwitch,
                                        FilterThreshold, FilterFilepath)
from samsifter.util.arg_sanitation import (check_pos_float,
                                           check_pos_float_max1,
                                           check_pos_int, check_sam)
from samsifter.util.filters import line_filter
from samsifter.util.papertrail import modify_header
from samsifter.version import VERSION

__version__ = VERSION

# global variables
TEXT = "Filter references by evenness of coverage"
DESC = "Filters reference accessions exceeding a Gini index threshold."


def item():
    """Create item representing this tool in list and tree views.

    Returns
    -------
    FilterItem
        Item for use in item-based list and tree views.
    """
    filter_item = FilterItem(text=TEXT, desc=DESC)
    filter_item.set_command(splitext(basename(__file__))[0])

    filter_item.add_parameter(FilterThreshold(
        text="min. Gini",
        desc="minimum Gini coefficient",
        cli_name='--min_gini',
        default=0.00,
        maximum=1.00
    ))

    filter_item.add_parameter(FilterThreshold(
        text="max. Gini",
        desc="maximum Gini coefficient",
        cli_name='--max_gini',
        default=0.90,
        maximum=1.00,
        required=True,
        active=True
    ))

    filter_item.add_parameter(FilterParameter(
        text="read lengths",
        desc="analyze read length distributions",
        cli_name="--read_lengths",
        default=False
    ))

    filter_item.add_parameter(FilterParameter(
        text="covered bases only",
        desc="covered bases only, ignore uncovered bases",
        cli_name="--covered",
        default=False
    ))

    filter_item.add_parameter(FilterFilepath(
        text="statistics file",
        desc=("override filename for tab-separated statistics "
              "[${filename}.stats.csv]"),
        cli_name='--stats_file',
        default="${filename}.stats.csv"
    ))

    filter_item.add_parameter(FilterParameter(
        text="plot distribution(s)",
        desc="plot coverage and/or read length distributions",
        cli_name="--plot",
        default=False
    ))

#    filter_item.add_parameter(FilterThreshold(
#        text="LCI argument",
#        desc="fraction of mean coverage to use for LCI integration",
#        cli_name='--lci_arg',
#        default=0.50,
#        maximum=1.00
#    ))
#
#    filter_item.add_parameter(FilterThreshold(
#        text="min. LCI",
#        desc="minimum low-coverage index",
#        cli_name='--min_lci',
#        default=0.0,
#        maximum=1000.0
#    ))
#
#    filter_item.add_parameter(FilterThreshold(
#        text="max. LCI",
#        desc="maximum low-coverage index",
#        cli_name='--max_lci',
#        default=1000.0,
#        maximum=1000.0
#    ))

    filter_item.add_parameter(FilterThreshold(
        text="min. coverage",
        desc="minimum average coverage",
        cli_name='--min_cov',
        default=0,
        maximum=9999,
        precision=0
    ))

    filter_item.add_parameter(FilterThreshold(
        text="max. coverage",
        desc="maximum average coverage",
        cli_name='--max_cov',
        default=9999,
        maximum=9999,
        precision=0
    ))

    filter_item.add_parameter(FilterSwitch(
        text="filter direction",
        desc="Keep or discard entries passing the filter criteria?",
        cli_name="--discard",
        default=0,
        options=["discard", "keep"]
    ))

    filter_item.add_parameter(FilterParameter(
        text="verbose",
        desc="print additional information to STDERR",
        cli_name="--verbose",
        default=True,
        active=True
    ))

    filter_item.add_parameter(FilterParameter(
        text="debug",
        desc="print debug messages to STDERR",
        cli_name="--debug",
        default=False
    ))

    filter_item.add_parameter(FilterParameter(
        text="write PG tag",
        desc="annotate the filtered SAM file by adding a PG tag to the header",
        cli_name="--modify",
        default=True,
        active=True
    ))

    return filter_item


# plotting methods
def plot_rld(ax, length_dist, min_length, max_length, read_count,
             length_dist_total):
    """Creates a bar plot of a read length distribution.

    Includes scaled expected distribution based on all reads in file.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    length_dist : array_like
        Read length distribution of subset of reads.
    min_length : int
        Minimum read length.
    max_length : int
        Maximum read length.
    read_count : int
        Total number of reads.
    length_dist_total : array_like
        Read length distribution of all reads.
    """
    ax.set_title('RLD')
    ax.bar(np.arange(length_dist.size), length_dist, align='center',
           width=0.8, color='gray', label='actual')
    ax.plot(np.arange(min_length, max_length + 1, dtype=int),
            # scale expected dist down
            (length_dist_total * 1.0 / read_count) * np.sum(length_dist),
            color='red', linestyle='dashed', label='expected')
    ax.axis([min_length, max_length, 0, None])
    ax.set_xlabel("read length [bp]")
    ax.set_ylabel("number of reads")


def plot_cd(ax, depth_dist, avg_depth):
    """Creates a bar plot of a coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    depth_dist : array_like
        Coverage depth distribution.
    avg_depth : float
        Mean coverage.
    """
    ax.set_title("CD")
    ax.bar(np.arange(depth_dist.size, dtype=int), depth_dist,
           align='center', width=0.8, color='gray')
    ax.axis([-0.5, depth_dist.size - 0.5, 0, None])
    ax.axvline(x=avg_depth, color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("reference bases")
    ax.legend(loc=1)    # upper right


def plot_scd(ax, depth_dist, avg_depth, ref_length, max_depth):
    """Creates a bar plot of a scaled coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    depth_dist : array_like
        Coverage depth distribution.
    avg_depth : float
        Mean coverage.
    ref_length : int
        Length of reference in nucleotides.
    max_depth : int
        Maximum coverage depth.
    """
    depth_dist = depth_dist * 1.0 / ref_length
    ax.set_title("SCD")
    ax.bar(np.arange(depth_dist.size, dtype=int), depth_dist,
           align='center', width=0.8, color='gray')
    ax.axis([-0.5, depth_dist.size - 0.5, 0, 1])
    ax.axvline(x=avg_depth, color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("fraction of reference bases")
    ax.legend(loc=1)    # upper right


def plot_ccd(ax, depth_dist_cumsum, avg_depth):
    """Creates a bar plot of a cumulative coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    depth_dist_cumsum : array_like
        Cumulative coverage depth distribution.
    avg_depth : float
        Mean coverage.
    """
    ax.set_title("CCD")
    ax.bar(np.arange(depth_dist_cumsum.size, dtype=int), depth_dist_cumsum,
           align='center', width=0.8, color='gray')
    ax.axis([-0.5, depth_dist_cumsum.size - 0.5, 0, None])
#    ax.axis([None, None, 0, None])
    ax.axvline(x=avg_depth, color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
#    ax.set_xticks(None, None, 1.0)
    ax.set_ylabel("cumulative bases")


def plot_sccd(ax, depth_dist_cumperc, avg_depth):
    """Creates a bar plot of a scaled cumulative coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    depth_dist_cumperc : array_like
        Scaled cumulative coverage depth distribution.
    avg_depth : float
        Mean coverage.
    """
    ax.set_title("SCCD")
    ax.bar(np.arange(depth_dist_cumperc.size), depth_dist_cumperc,
           align='edge', width=1, color='gray')
    ax.axis([0, depth_dist_cumperc.size - 1, 0, None])
    ax.axvline(x=avg_depth, color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("cumulative fraction of alignment")


def plot_nccd(ax, depth_dist_cumperc, avg_depth, max_depth, avg_depth_scaled,
              lci, color):
    """Creates a bar plot of a normalized cumulative coverage distribution.

    Includes legend stating average scaled and total depth.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    depth_dist_cumperc : array_like
        Scaled cumulative coverage depth distribution.
    avg_depth : float
        Mean coverage.
    max_depth : int
        Maximum coverage depth.
    avg_depth_scaled : float
        Scaled mean coverage.
    lci : float
        LCI parameter.
    color : str
        Color name to be used for the bar plot.
    """
    ax.set_title("NCCD")
    ax.bar(np.arange(depth_dist_cumperc.size) * 1.0 / max_depth,
           depth_dist_cumperc, align='edge',
           width=1.0 / (depth_dist_cumperc.size - 1), color=color)
    ax.axis([0, 1, 0, 1])
    ax.axvline(x=avg_depth_scaled, color='green',
               label="avg. coverage: %.2f [%.2fx]" % (avg_depth_scaled,
                                                      avg_depth))
    ax.axvline(x=lci * avg_depth_scaled, linestyle='dashed', color='green',
               label="%.2f x avg. cov.: %.2f [%.2fx]" % (
                   lci, lci * avg_depth_scaled, lci * avg_depth))
    ax.set_xlabel("fraction of max coverage")
    ax.set_ylabel("cumulative fraction of alignment")
    ax.legend(loc=4)    # lower right


def plot_lorenz(ax, x, y):
    """Creates a Lorenz curve plot of a coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    x : array_like
        Coverage bins.
    y : array_like
        Aligned bases per coverage bin.
    """
    (gini, auc) = get_gini_auc(x, y)
    ax.set_title("Lorenz")
    # show equal distribution
    ax.plot([0, 1], [0, 1], color='green', label="equal distribution")
    # show actual distribution
    ax.plot(x, y, label="actual distribution")
    ax.fill(x, y, color='gray', alpha=0.2)
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("fraction of coverage bins")
    ax.set_ylabel("fraction of alignment")
    ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
    ax.legend(loc=2)    # upper left


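# Illustrative usage sketch (not part of the original module): rendering a
# Lorenz curve for a toy coverage distribution to a PNG file, using the Agg
# backend selected at import time; the numbers are made up for demonstration.
#
#     fig = plt.figure(figsize=(6, 6))
#     ax = fig.add_subplot(1, 1, 1)
#     x, y = lorenzify(np.array([0, 5, 3, 2]))
#     plot_lorenz(ax, x, y)
#     fig.savefig("lorenz_demo.png")
#     plt.close(fig)

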
def plot_lorenz_b2b(ax, x, y):
    """Creates a Lorenz curve plot of a base2base coverage distribution.

    Parameters
    ----------
    ax : Axes
        Axes instance of current plot.
    x : array_like
        Reference base positions.
    y : array_like
        Aligned bases per reference base.
    """
    (gini, auc) = get_gini_auc(x, y)
    ax.set_title("Lorenz base2base")
    # show equal distribution
    ax.plot([0, 1], [0, 1], color='green', label="equal distribution")
    # show actual distribution
    ax.plot(x, y, label="actual distribution")
    ax.fill(x, y, color='gray', alpha=0.2)
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("fraction of reference bases")
    ax.set_ylabel("fraction of aligned read bases")
    ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
    ax.legend(loc=2)    # upper left


# Integration methods
def integral_discrete(dist, limit):
    """Integrate discrete distribution with stepsize 1 by adding up values.

    Parameters
    ----------
    dist : array_like
        Discrete distribution with stepsize 1.
    limit : int
        Upper limit (exclusive).

    Returns
    -------
    float
        Integral of the distribution between 0 and upper limit.
    """
    return np.sum(dist[0:limit]) / 100.0


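# Illustrative example (not part of the original module); the division by 100
# above presumably assumes a distribution given in percent:
#
#     integral_discrete(np.array([10, 20, 30]), 2)   # -> (10 + 20) / 100 = 0.3

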
def integral_scaled(dist, limit):
    """Integrates scaled discrete distribution with arbitrary stepsize.

    Parameters
    ----------
    dist : array_like
        Scaled discrete distribution with arbitrary stepsize.
    limit : float
        Upper limit (exclusive).

    Returns
    -------
    float
        Integral of the distribution between 0 and upper limit.
    """
    area = 0

    # prevent division by zero (for cases with 100% 1x coverage)
    if dist.size == 1:
        return 1.0 * limit

    # scale limit by max depth
    if limit <= dist.size - 1:
        limit_scaled = limit * 1.0 / (dist.size - 1)
    else:
        limit_scaled = 1.0

    stepsize = 1.0 / (dist.size - 1)
    x = stepsize
    for y in np.nditer(dist):
        if limit_scaled > x:
            area += stepsize * y
        else:
            area += (limit_scaled - (x - stepsize)) * y
            break
        x += stepsize
    return area


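# Illustrative example (not part of the original module): integrating a
# cumulative coverage distribution up to a coverage limit of 1.5x; the x axis
# is rescaled to [0, 1] internally, so the limit 1.5 maps to 0.75 here:
#
#     integral_scaled(np.array([0.25, 0.5, 1.0]), 1.5)
#     # -> 0.5 * 0.25 + (0.75 - 0.5) * 0.5 = 0.25

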
def lorenzify(depth_dist):
    """Calculates Lorenz curve from coverage distribution.

    Parameters
    ----------
    depth_dist : array_like
        Coverage depth distribution.

    Returns
    -------
    x : array_like
        X coordinates of Lorenz curve.
    y : array_like
        Y coordinates of Lorenz curve.
    """
    reads_sum = np.sum(depth_dist)
    dd_scaled = depth_dist * 1.0 / reads_sum
    dd_sorted = np.sort(dd_scaled)
    dd_cum = np.cumsum(dd_sorted)

    # append (0|0) origin to beginning of arrays
    y = np.append(np.array([0.0]), dd_cum)
    x = np.append(np.array([0.0]),
                  (np.arange(depth_dist.size) + 1) * 1.0 / depth_dist.size)
    return (x, y)


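# Illustrative example (not part of the original module): four coverage bins
# holding 0, 5, 3 and 2 bases are scaled, sorted and cumulated, then an
# origin point is prepended:
#
#     x, y = lorenzify(np.array([0, 5, 3, 2]))
#     # x -> [0.0, 0.25, 0.5, 0.75, 1.0]
#     # y -> [0.0, 0.0, 0.2, 0.5, 1.0]

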
def lorenzify_b2b(depth_dist, ignore_uncovered=True):
    """Calculate Lorenz curve from base2base coverage distribution.

    Parameters
    ----------
    depth_dist : array_like
        Coverage depth distribution.
    ignore_uncovered : bool
        Ignore bases with zero coverage, defaults to True.

    Returns
    -------
    x : array_like
        X coordinates of Lorenz curve.
    y : array_like
        Y coordinates of Lorenz curve.
    """
    ali_total = 0
    ref_total = 0
    ali_bases_list = []
    ref_bases_list = []
    for (coverage, count) in enumerate(depth_dist):
        if ignore_uncovered:
            if coverage == 0:
                continue
        ali_bases = count * coverage
        ali_bases_list.append(ali_bases)
        ref_bases_list.append(count)
        ali_total += ali_bases
        ref_total += count

    # turn lists into arrays and scale them
    b = np.array(ref_bases_list) * 1.0 / ref_total
    v = np.array(ali_bases_list) * 1.0 / ali_total

    triples = np.array([b, v, v/b])
    ttriples = np.transpose(triples)
    tttriples = np.transpose(ttriples[ttriples[:, 2].argsort()])

    x = np.cumsum(tttriples[0])
    y = np.cumsum(tttriples[1])

    # append (0|0) origin to beginning of arrays, scale both axes on the fly
    x = np.append(np.array([0]), x)
    y = np.append(np.array([0]), y)
    return (x, y)


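# Illustrative example (not part of the original module): chaining
# lorenzify_b2b() and get_gini_auc() on a toy distribution (5 reference bases
# covered at 1x, 3 at 2x, 2 at 3x) yields a moderate Gini coefficient:
#
#     x, y = lorenzify_b2b(np.array([0, 5, 3, 2]))
#     gini, auc = get_gini_auc(x, y)   # gini -> ~0.24

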
def get_gini_auc(x, y):
    """Calculates Gini coefficient and area under Lorenz curve.

    The Gini coefficient (also known as the Gini index) is a measure of
    statistical dispersion. When applied to the distribution of aligned read
    bases per reference base, an even distribution of reads across the
    reference should have a low Gini coefficient (towards 0) while an
    alignment with all reads covering the same reference region should have a
    high Gini coefficient (towards 1).

    Parameters
    ----------
    x : array_like
        X coordinates of a normalized Lorenz distribution.
    y : array_like
        Y coordinates of a normalized Lorenz distribution.

    Returns
    -------
    float
        Gini coefficient. Ranges from 0 for a perfectly even distribution to
        1 for a maximally uneven distribution.
    float
        Integral of the Lorenz curve (area under curve). Maximal value is 0.5
        if the given distribution is uniform.
    """
    # calculate area under curve
    auc = 0.0
    for (idx, value) in enumerate(x):
        # no area at origin (0|0)
        if idx == 0:
            continue
        square = (x[idx] - x[idx - 1]) * y[idx - 1]
        triangle = ((x[idx] - x[idx - 1]) * (y[idx] - y[idx - 1])) / 2.0
        auc = auc + square + triangle

    # calculate Gini coefficient (area under normalized uniform dist is 0.5)
    gini = 1.0 - (2 * auc)
    return (gini, auc)


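# Illustrative example (not part of the original module): the trapezoid sum
# over a perfectly even Lorenz curve gives auc = 0.5 and therefore gini = 0:
#
#     points = np.array([0.0, 0.5, 1.0])
#     gini, auc = get_gini_auc(points, points)   # -> (0.0, 0.5)

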
def covered_length_from_cigar(cigar):
    """Calculates length of reference covered by read from CIGAR operations.

    Note
    ----
    * not counting padding, skipping or insertions into the reference
    * not counting hard or soft clipped bases of the read

    Parameters
    ----------
    cigar : str
        Unmodified CIGAR string from SAM file.

    Returns
    -------
    int
        Length of the reference sequence covered by the read.
    """
    regex = re.compile(r"(\d+)([MIDNSHP=X])")
    length = 0
    for match in regex.finditer(cigar):
        if match.group(2) in ("M", "=", "X", "D"):
            length += int(match.group(1))
    return length


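# Illustrative example (not part of the original module): only M, =, X and D
# operations consume reference bases; soft clips and insertions do not:
#
#     covered_length_from_cigar("5S10M2I4M1D3M4S")   # -> 10 + 4 + 1 + 3 = 18

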
def calc_avg_depth(depth_dist, ignore_uncovered=True):
    """Calculate average depth from a coverage distribution.

    Optionally ignores uncovered bases (first array element).

    Parameters
    ----------
    depth_dist : array_like
        Sorted coverage depth distribution.
    ignore_uncovered : bool, optional
        Ignore bases with zero coverage (first array element), defaults to
        True.

    Returns
    -------
    float
        Mean coverage depth.
    """
    sum_bases = 0
    sum_cover = 0
    for (coverage, count) in enumerate(depth_dist):
        if ignore_uncovered:
            if coverage == 0:
                continue
        sum_bases += count
        sum_cover += coverage * count
    return sum_cover / (sum_bases * 1.0)


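# Illustrative example (not part of the original module): 4 uncovered bases,
# 5 bases at 1x, 3 at 2x and 2 at 3x:
#
#     dist = np.array([4, 5, 3, 2])
#     calc_avg_depth(dist)                           # -> 17 / 10 = 1.7
#     calc_avg_depth(dist, ignore_uncovered=False)   # -> 17 / 14 ~ 1.21

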
def main():
    """Executable to filter SAM files for references with uneven coverage.

    See ``--help`` for details on expected arguments. Takes input from
    STDIN or from optional or positional file arguments. Logs messages to
    STDERR and writes processed SAM files to STDOUT.
    """
    # parse arguments
    parser = argparse.ArgumentParser(description=DESC)
    parser.add_argument('-i', '--input',
                        type=check_sam,
                        help="specify SAM file to be analysed (default: "
                        "STDIN)",
                        required=False)
    parser.add_argument('--discard',
                        type=int,
                        help="keep or discard entries passing the filter "
                        "criteria?",
                        required=False,
                        default=0)
    parser.add_argument('-v', '--verbose',
                        required=False,
                        action='store_true',
                        help='print additional information to stderr')
    parser.add_argument('-d', '--debug',
                        required=False,
                        action='store_true',
                        help='print debug messages to stderr')
    parser.add_argument('-m', '--modify',
                        required=False,
                        action='store_true',
                        help='modify header by adding PG record')
    parser.add_argument('-r', '--read_lengths',
                        required=False,
                        action='store_true',
                        help='analyze read length distributions')
    parser.add_argument('-c', '--covered',
                        required=False,
                        action='store_true',
                        help='covered bases only, ignore uncovered bases')
    parser.add_argument('-p', '--plot',
                        required=False,
                        action='store_true',
                        help='plot distribution(s)')
    parser.add_argument('-l', '--lci_arg',
                        required=False,
                        type=check_pos_float_max1,
                        help='fraction of mean coverage to use for LCI '
                        'integration',
                        default=0.5)
    parser.add_argument('-s', '--stats_file',
                        required=False,
                        help="override filename for tab-separated statistics")
    acc_thresholds = parser.add_mutually_exclusive_group(required=False)
    acc_thresholds.add_argument('--min_cov',
                                type=check_pos_int,
                                help='minimum average coverage',
                                default=0)
    acc_thresholds.add_argument('--max_cov',
                                type=check_pos_int,
                                help='maximum average coverage',
                                default=sys.maxsize)
    acc_thresholds.add_argument('--min_lci',
                                type=check_pos_float,
                                help='minimum LCI',
                                default=0.0)
    acc_thresholds.add_argument('--max_lci',
                                type=check_pos_float,
                                help='maximum LCI',
                                default=1000.0)
    acc_thresholds.add_argument('--min_gini',
                                type=check_pos_float_max1,
                                help='minimum Gini coefficient',
                                default=0.0)
    acc_thresholds.add_argument('--max_gini',
                                type=check_pos_float_max1,
                                help='maximum Gini coefficient',
                                default=1.0)
    (args, remain_args) = parser.parse_known_args()

    # configure logging
    if args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

#    if args.debug:
#        from pprint import pprint

    # set filename for statistics output
    if args.stats_file:
        stats_filename = args.stats_file
    elif args.input:
        stats_filename = args.input + ".stats.csv"
    else:
        # this assumes that filename was explicitly set by calling batch
        # script
        stats_filename = "${filename}.stats.csv"

    # initialize dicts
    header = []         # SAM header lines, unmodified
    depths = {}         # dict of dicts: position -> depth
    ref_lengths = {}    # dict of ints
    read_counts = {}    # dict of ints
    if args.read_lengths:
        read_lengths = {}        # dict of dict of ints
        read_lengths_total = {}  # dict of ints
    line_count = 0
    positions = {}      # dict of list for references and their line numbers
    buffer = None       # file handle used to buffer complete STDIN
    byte_offset = 0     # offset of first line after end of header
    line_offset = 0

    # parse SAM file and divide reads by reference accession
    log.info("START of filtering references by coverage.")
    log.info("STEP 1: parsing SAM records.")
    try:
        # open SAM file from either command line argument or STDIN, use a
        # temporary file as buffer if reading from STDIN
        if args.input:
            handle = open(args.input, 'r')
        else:
            handle = fileinput.input(remain_args)
            buffer = tempfile.TemporaryFile(mode='w+')

        for line_nr, line in enumerate(handle, 0):
            line_count = line_nr + 1
            if buffer is not None:
                buffer.write(line)

            # assuming that headers always precede alignments (SAMv1)
            if line.startswith("@"):
                if args.modify:
                    line_offset = line_nr + 1
                    # store header lines for later modification
                    header.append(line)
                # increase offset until reaching end of header
                byte_offset += len(line)
            else:
                fields = line.split("\t")
                """
                #   content     description
                ===========================
                0   QNAME
                1   FLAG
                2   RNAME
                3   POS
                4   MAPQ
                5   CIGAR
                6   RNEXT
                7   PNEXT
                8   TLEN
                9   SEQ
                10  QUAL
                ...optional fields...
                11  AS          alignment score
                12  NM          edit distance to reference
                13  ZL          reference length
                14  ZR          "raw score"
                15  ZE          "expected"
                16  ZI          percent identity (incl. indels and deaminated)
                17  MD          mismatching positions
                18  NS          PMD score
                """
                # extract accession from required RNAME field
                # (as written by MALT 0.0.3)
                rname_parts = fields[2].split("|")
                """
                #   content
                ===============
                0   gi
                1   GenBank ID
                2   ref
                3   accession
                4   tax
                5   taxonomy ID
                """
                accession = ""
                try:
                    accession = rname_parts[3]
                except IndexError:
                    accession = "unknown"
                    log.warning("no accession in line %s, assigning read to "
                                "accession 'unknown'", line_nr + 1)

                # extract reference sequence length from optional ZL field
                ref_length = 0
                try:
                    zl_parts = fields[13].split(":")
                    ref_length = int(zl_parts[2])
                except IndexError:
                    log.error("no ZL field in line %s", line_nr + 1)
                    exit(1)

                """
                read         ------------------------------
                ref   -----------------------------------------------------
                cover 00000000000011111111111111111111111111111100000000000
                          pre    |           covered         |    post
                               start                        stop
                """
                # get 1-based start position of read from required POS field
                pre = int(fields[3]) - 1
                # obtain stop position from required CIGAR field
                covered = covered_length_from_cigar(fields[5])
                stop = pre + covered

                if accession in depths:
                    read_counts[accession] += 1
                    for position in range(pre, stop + 1):
                        if position in depths[accession]:
                            depths[accession][position] += 1
                        else:
                            depths[accession][position] = 1
                else:
                    ref_lengths[accession] = ref_length
                    read_counts[accession] = 1
                    depths[accession] = {}
                    for position in range(pre, stop + 1):
                        depths[accession][position] = 1

                # optional: read length distribution
                if args.read_lengths:
                    read_length = len(fields[9])
                    if read_length in read_lengths_total:
                        read_lengths_total[read_length] += 1
                    else:
                        read_lengths_total[read_length] = 1

                    if accession in read_lengths:
                        if read_length in read_lengths[accession]:
                            read_lengths[accession][read_length] += 1
                        else:
                            read_lengths[accession][read_length] = 1
                    else:
                        read_lengths[accession] = {read_length: 1}

                # remember occurrence of accession and line number(s)
                try:
                    positions[accession].append(line_nr)
                except KeyError:
                    positions[accession] = [line_nr, ]
    finally:
        handle.close()
        if buffer is not None:
            buffer.seek(0)

    log.info("%i reads are aligned to %i references", line_count,
             len(depths))

    # get overall read length statistics
    if args.read_lengths:
        max_length_total = max(read_lengths_total.keys())
        min_length_total = min(read_lengths_total.keys())
        length_dist_total = np.zeros(
            (max_length_total - min_length_total + 1, ), dtype=int)
        for (length, count) in read_lengths_total.items():
            length_dist_total[length - min_length_total] = count
        log.info("read lengths range from %i to %i bp", min_length_total,
                 max_length_total)

    # iterate over accessions and calculate statistics for filtering
    log.info("STEP 2: determining accessions matching the filter criteria.")
    lines = []
    max_depths = []
    max_lengths = []
    with open(stats_filename, "w") as handle:
        # create statistics header
        stats_header = "accession\treference length [bp]\treads"
        if args.read_lengths:
            stats_header += ("\tmin length [bp]\tmax length [bp]"
                             "\tmean length [bp]")
        stats_header += "\tmin coverage\tmax coverage\tmean coverage"
        if args.covered:
            stats_header += "\tmin coverage*\tmax coverage*\tmean coverage*"
        stats_header += "\tLCI(%.2f)\tGini" % (args.lci_arg)
        if args.covered:
            stats_header += "\tLCI(%.2f)*\tGini*" % (args.lci_arg)
        print(stats_header, file=handle)

        for accession in depths:
            ref_length = ref_lengths[accession]
            read_count = read_counts[accession]

            # calculate coverage statistics
            max_depth = max(depths[accession].values())
            max_depths.append(max_depth)

            # determine depth distribution
            uncovered_bases = ref_length
            depth_dist = np.zeros((max_depth + 1,), dtype=int)
            for (pos, depth) in depths[accession].items():
                depth_dist[depth] += 1
                uncovered_bases -= 1
            depth_dist[0] = uncovered_bases
            if args.debug:
                print(depth_dist)

            min_depth = 0
            if uncovered_bases == 0:
                min_depth = min(depths[accession].values())
            avg_depth = calc_avg_depth(depth_dist, False)
            avg_depth_scaled = avg_depth * 1.0 / max_depth

            # convert counts to percentage of total reads (normalize y)
            depth_dist_cumsum = np.cumsum(depth_dist)
            depth_dist_cumperc = depth_dist_cumsum * 1.0 / ref_length

            # integrate distribution as lci(0.5)
            limit = args.lci_arg * avg_depth
#            lci = integral_discrete(depth_dist_cumperc, limit)
            lci = integral_scaled(depth_dist_cumperc, limit)

#            # calculate Gini coefficient from Lorenz curve
#            (x, y) = lorenzify(depth_dist)
#            (gini, auc) = get_gini_auc(x, y)

            # alternative Lorenz curve using base to base assignments
            (u, v) = lorenzify_b2b(depth_dist, False)
            if args.debug:
                print("x (%i entries): %s" % (len(u), u))
                print("y (%i entries): %s" % (len(v), v))
            (gini_b2b, auc_b2b) = get_gini_auc(u, v)

            # TODO also calculate Theil index, see
            # https://en.wikipedia.org/wiki/Theil_Index

            # optional: coverage stats for covered bases only
            if args.covered:
                min_depth_covered = min(depths[accession].values())
                if uncovered_bases == 0:
                    min_depth_covered = min_depth
                avg_depth_covered = calc_avg_depth(depth_dist, True)
#                avg_depth_covered_scaled = avg_depth_covered * 1.0 / max_depth
                depth_dist_covered_cumsum = np.cumsum(depth_dist[1:])
                depth_dist_covered_cumperc = (depth_dist_covered_cumsum * 1.0
                                              / (ref_length
                                                 - uncovered_bases))
                limit_covered = args.lci_arg * avg_depth_covered
                lci_covered = integral_scaled(depth_dist_covered_cumperc,
                                              limit_covered)
                (m, n) = lorenzify_b2b(depth_dist, True)
                if args.debug:
                    print("x (%i entries): %s" % (len(m), m))
                    print("y (%i entries): %s" % (len(n), n))
                (gini_b2b_covered, auc_b2b_covered) = get_gini_auc(m, n)

            # optional: read length stats
            if args.read_lengths:
                max_length = max(read_lengths[accession].keys())
                min_length = min(read_lengths[accession].keys())
                # mean over distinct read lengths (unweighted)
                avg_length = np.mean(np.fromiter(
                    read_lengths[accession].keys(), dtype=int,
                    count=len(read_lengths[accession])))
                max_lengths.append(max_length)

                # determine length distribution
                length_dist = np.zeros((max_length + 1,), dtype=int)
                for (length, count) in read_lengths[accession].items():
                    length_dist[length] = count

            # compile accession statistics
            stats = "%s\t%i\t%i" % (accession, ref_length, read_count)
            if args.read_lengths:
                stats += "\t%i\t%i\t%f" % (min_length, max_length,
                                           avg_length)
            stats += "\t%i\t%i\t%f" % (min_depth, max_depth, avg_depth)
            if args.covered:
                stats += "\t%i\t%i\t%f" % (min_depth_covered, max_depth,
                                           avg_depth_covered)
            stats += "\t%f\t%f" % (lci, gini_b2b)
            if args.covered:
                stats += "\t%f\t%f" % (lci_covered, gini_b2b_covered)
            print(stats, file=handle)

            # apply filters before time-consuming plotting steps
            if ((avg_depth < args.min_cov)
                    or (avg_depth > args.max_cov)
                    or (lci < args.min_lci)
                    or (lci > args.max_lci)
                    or (gini_b2b < args.min_gini)
                    or (gini_b2b > args.max_gini)):
                continue

            color = 'blue'
            if args.plot:
                fig_cover = plt.figure(num=1,
                                       figsize=(24, 12),  # inch x inch
                                       dpi=80,            # 1 inch = 80 pixels
                                       facecolor='white',
                                       edgecolor='black')
#                x = np.arange(depth_dist_cumperc.size) * 1.0 / max_depth

                # CD = coverage distribution plot
                ax_cd = fig_cover.add_subplot(2, 4, 1)
                plot_cd(ax_cd, depth_dist, avg_depth)

                # SCD = scaled coverage distribution plot
                ax_scd = fig_cover.add_subplot(2, 4, 2)
                plot_scd(ax_scd, depth_dist, avg_depth, ref_length, max_depth)

                # SCD = scaled coverage distribution plot (covered bases only)
                if args.covered:
                    ax_scd_covered = fig_cover.add_subplot(2, 4, 3)
                    plot_scd(ax_scd_covered,
                             np.append(np.array([0]), depth_dist[1:]),
                             avg_depth_covered, ref_length - uncovered_bases,
                             max_depth)

#                # CCD = cumulative coverage distribution plot
#                ax_ccd = fig_cover.add_subplot(2, 4, 2)
#                plot_ccd(ax_ccd, depth_dist_cumsum, avg_depth)

#                # SCCD = scaled cumulative coverage distribution plot
#                ax_sccd = fig_cover.add_subplot(2, 4, 3)
#                plot_sccd(ax_sccd, depth_dist_cumperc, avg_depth)

                # NCCD = normalized cumulative coverage distribution
                ax_nccd = fig_cover.add_subplot(2, 4, 4)
                plot_nccd(ax_nccd, depth_dist_cumperc, avg_depth, max_depth,
                          avg_depth_scaled, args.lci_arg, color)
                ax_nccd.text(0.05, 0.95,
                             "lci(%.2f) = %.2f" % (args.lci_arg, lci),
                             transform=ax_nccd.transAxes)

#                # Lorenz curve
#                ax_lorenz = fig_cover.add_subplot(2, 4, 5)
#                plot_lorenz(ax_lorenz, x, y)

                # Lorenz curve base2base
                ax_b2b = fig_cover.add_subplot(2, 4, 6)
                plot_lorenz_b2b(ax_b2b, u, v)

                if args.covered:
                    ax_b2b_covered = fig_cover.add_subplot(2, 4, 7)
                    plot_lorenz_b2b(ax_b2b_covered, m, n)

                # save plot to png
                fig_cover.tight_layout()
                fig_cover.savefig(accession + "_coverage.png")
                plt.close(fig_cover)

                if args.read_lengths:
                    fig_readlength = plt.figure(num=2,
                                                figsize=(6, 6),  # inch x inch
                                                dpi=80,     # 1 inch = 80px
                                                facecolor='white',
                                                edgecolor='black')
                    # RLD = read length distribution
                    ax_rld = fig_readlength.add_subplot(1, 1, 1)
                    plot_rld(ax_rld, length_dist, min_length_total,
                             max_length_total, line_nr + 1,
                             length_dist_total)
                    # save plot to png
                    fig_readlength.savefig(accession + "_readlengths.png")
                    plt.close(fig_readlength)

            log.info("accession %s: reference has a length of %i bp and %i "
                     "aligned reads", accession, ref_length, read_count)
            log.info("accession %s: coverage ranges from %ix to %ix, average "
                     "is %.3fx", accession, min_depth, max_depth, avg_depth)
            if args.covered:
                log.info("accession %s: coverage for covered bases only "
                         "ranges from %ix to %ix, average is %.3fx",
                         accession, min_depth_covered, max_depth,
                         avg_depth_covered)
            if args.read_lengths:
                log.info("accession %s: read length ranges from %i bp to %i "
                         "bp, average is %.3f bp", accession, min_length,
                         max_length, avg_length)

            # finally print accession for later filtering steps
            lines.extend(positions[accession])
            if args.debug:
                log.info("filtering accession: %s", accession)

    # re-open file if not a buffered stream
    if buffer is None:
        buffer = open(args.input, 'r')

    # skip original headers and sneak in modified headers instead
    if args.modify:
        log.info("Writing new @PG record to SAM header.")
        cline = " ".join(sys.argv[1:])
        modified_header = modify_header(
            header,
            name=splitext(basename(__file__))[0],
            version=__version__,
            command_line=cline,
            description=TEXT
        )
        extra_headers = len(modified_header) - len(header)  # should be 1
        for record in modified_header:
            print(record.rstrip(), file=sys.stdout)

        # fast forward buffer to beginning of alignments
        buffer.seek(byte_offset)
        if args.debug:
            log.info("%i new line(s) added to header lines.", extra_headers)

    # keep or remove accessions from SAM file
    alt = ["discarding", "keeping"]
    log.info("STEP 3: %s %i unevenly covered references from SAM file.",
             alt[args.discard], len(lines))
    line_filter(lines, buffer, discard=(args.discard == 0),
                offset=line_offset)
    buffer.close()

    log.info("END of filtering references by coverage, statistics saved to "
             "file %s", stats_filename)
    exit()


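# Illustrative invocation (not part of the original module), assuming the
# module is installed as the script filter_ref_coverage.py; flag names are
# taken from the argument parser above:
#
#     filter_ref_coverage.py --input input.sam --max_gini 0.90 \
#         --verbose > filtered.sam 2> filter.log

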
if __name__ == "__main__":
    main()