#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Identify reference accessions with uneven coverage in MALT'ed SAM files.
Comes with several methods to create optional plots of coverage and read length
distributions.
Warning
-------
Activating the plotting of these distributions for a large input dataset can
create I/O problems due to the large number of PNG files generated. It will
also decrease the performance of this filter considerably and should only be
used to troubleshoot filter parameters for small subsets of the data.
.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
if sys.version_info[0] < 3:
    print("Error: this script requires Python 3 or newer")
    sys.exit(1)
import argparse
import logging as log
import fileinput
import tempfile
from os.path import basename, splitext
import re
import matplotlib as mpl
mpl.use('Agg')  # set before importing pyplot, else trouble on headless servers
import matplotlib.pyplot as plt
import numpy as np
# custom libraries
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import (FilterParameter, FilterSwitch,
FilterThreshold, FilterFilepath)
from samsifter.util.arg_sanitation import (check_pos_float,
check_pos_float_max1,
check_pos_int, check_sam)
from samsifter.util.filters import line_filter
from samsifter.util.papertrail import modify_header
from samsifter.version import VERSION
__version__ = VERSION
# global variables
TEXT = "Filter references by evenness of coverage"
DESC = "Filters reference accessions exceeding a Gini index threshold."
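# A minimal usage sketch (hypothetical file names; the actual script name is
# derived from this module's file name). SAM input comes from -i/--input or
# STDIN, the filtered SAM goes to STDOUT:
#
#   filter_ref_coverage.py -i sample.sam --max_gini 0.90 \
#       --stats_file sample.stats.csv > filtered.sam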
def item():
"""Create item representing this tool in list and tree views.
Returns
-------
FilterItem
Item for use in item-based list and tree views.
"""
filter_item = FilterItem(text=TEXT, desc=DESC)
filter_item.set_command(splitext(basename(__file__))[0])
filter_item.add_parameter(FilterThreshold(
text="min. Gini",
desc="minimum Gini coefficient",
cli_name='--min_gini',
default=0.00,
maximum=1.00
))
filter_item.add_parameter(FilterThreshold(
text="max. Gini",
desc="maximum Gini coefficient",
cli_name='--max_gini',
default=0.90,
maximum=1.00,
required=True,
active=True
))
filter_item.add_parameter(FilterParameter(
text="read lengths",
desc="analyze read length distributions",
cli_name="--read_lengths",
default=False
))
filter_item.add_parameter(FilterParameter(
text="covered bases only",
desc="covered bases only, ignore uncovered bases",
cli_name="--covered",
default=False
))
filter_item.add_parameter(FilterFilepath(
text="statistics file",
desc=("override filename for tab-separated statistics "
"[${filename}.stats.csv]"),
cli_name='--stats_file',
default="${filename}.stats.csv"
))
filter_item.add_parameter(FilterParameter(
text="plot distribution(s)",
desc="plot coverage and/or read length distributions",
cli_name="--plot",
default=False
))
# filter_item.add_parameter(FilterThreshold(
# text="LCI argument",
# desc="fraction of mean coverage to use for LCI integration",
# cli_name='--lci_arg',
# default=0.50,
# maximum=1.00
# ))
#
# filter_item.add_parameter(FilterThreshold(
# text="min. LCI",
# desc="minimum low-coverage index",
# cli_name='--min_lci',
# default=0.0,
# maximum=1000.0
# ))
#
# filter_item.add_parameter(FilterThreshold(
# text="max. LCI",
# desc="maximum low coverage index",
# cli_name='--max_lci',
# default=1000.0,
# maximum=1000.0
# ))
filter_item.add_parameter(FilterThreshold(
text="min. coverage",
desc="minimum average coverage",
cli_name='--min_cov',
default=0,
maximum=9999,
precision=0
))
filter_item.add_parameter(FilterThreshold(
text="max. coverage",
desc="maximum average coverage",
cli_name='--max_cov',
default=9999,
maximum=9999,
precision=0
))
filter_item.add_parameter(FilterSwitch(
text="filter direction",
desc="Keep or discard entries passing the filter criteria?",
cli_name="--discard",
default=0,
options=["discard", "keep"]
))
filter_item.add_parameter(FilterParameter(
text="verbose",
desc="print additional information to STDERR",
cli_name="--verbose",
default=True,
active=True
))
filter_item.add_parameter(FilterParameter(
text="debug",
desc="print debug messages to STDERR",
cli_name="--debug",
default=False
))
filter_item.add_parameter(FilterParameter(
text="write PG tag",
desc="annotate the filtered SAM file by adding a PG tag to the header",
cli_name="--modify",
default=True,
active=True
))
return filter_item
# plotting methods
def plot_rld(ax, length_dist, min_length, max_length, read_count,
length_dist_total):
"""Creates a bar plot of a read length distribution.
Includes scaled expected distribution based on all reads in file.
Parameters
----------
ax : Axes
Axes instance of current plot.
length_dist : array_like
Read length distribution of subset of reads.
min_length : int
Minimum read length.
max_length : int
Maximum read length.
read_count : int
Total number of reads.
length_dist_total : array_like
Read length distribution of all reads.
"""
ax.set_title('RLD')
ax.bar(np.arange(length_dist.size),
length_dist,
align='center',
width=0.8,
color='gray',
label='actual')
    ax.plot(np.arange(min_length, max_length + 1, dtype=int),
# scale expected dist down
(length_dist_total * 1.0 / read_count) * np.sum(length_dist),
color='red',
linestyle='dashed',
label='expected')
ax.axis([min_length, max_length, 0, None])
ax.set_xlabel("read length [bp]")
ax.set_ylabel("number of reads")
def plot_cd(ax, depth_dist, avg_depth):
"""Creates a bar plot of a coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
depth_dist : array_like
Coverage depth distribution.
avg_depth : float
Mean coverage.
"""
ax.set_title("CD")
    ax.bar(np.arange(depth_dist.size, dtype=int),
depth_dist,
align='center',
width=0.8,
color='gray')
ax.axis([-0.5, depth_dist.size - 0.5, 0, None])
ax.axvline(x=avg_depth,
color='green',
label="avg. coverage: %.2fx" % (avg_depth,))
ax.set_xlabel("coverage")
ax.set_ylabel("reference bases")
ax.legend(loc=1) # upper right
def plot_scd(ax, depth_dist, avg_depth, ref_length, max_depth):
"""Creates a bar plot of a scaled coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
depth_dist : array_like
coverage depth distribution.
avg_depth : float
Mean coverage.
ref_length : int
Length of reference in nucleotides.
max_depth : int
Maximum coverage depth.
"""
depth_dist = depth_dist * 1.0 / ref_length
ax.set_title("SCD")
    ax.bar(np.arange(depth_dist.size, dtype=int),
depth_dist,
align='center',
width=0.8,
color='gray')
ax.axis([-0.5, depth_dist.size - 0.5, 0, 1])
ax.axvline(x=avg_depth,
color='green',
label="avg. coverage: %.2fx" % (avg_depth,))
ax.set_xlabel("coverage")
ax.set_ylabel("fraction of reference bases")
ax.legend(loc=1) # upper right
def plot_ccd(ax, depth_dist_cumsum, avg_depth):
"""Creates a bar plot of a cumulative coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
depth_dist_cumsum : array_like
Cumulative coverage depth distribution.
avg_depth : float
Mean coverage.
"""
ax.set_title("CCD")
    ax.bar(np.arange(depth_dist_cumsum.size, dtype=int),
depth_dist_cumsum,
align='center',
width=0.8,
color='gray')
ax.axis([-0.5, depth_dist_cumsum.size - 0.5, 0, None])
# ax.axis([None, None, 0, None])
ax.axvline(x=avg_depth,
color='green',
label="avg. coverage: %.2fx" % (avg_depth,))
ax.set_xlabel("coverage")
# ax.set_xticks(None, None, 1.0)
ax.set_ylabel("cumulative bases")
def plot_sccd(ax, depth_dist_cumperc, avg_depth):
"""Creates a bar plot of a scaled cumulative coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
depth_dist_cumperc : array_like
Scaled cumulative coverage depth distribution.
avg_depth : float
Mean coverage.
"""
ax.set_title("SCCD")
ax.bar(np.arange(depth_dist_cumperc.size),
depth_dist_cumperc,
align='edge',
width=1,
color='gray')
ax.axis([0, depth_dist_cumperc.size - 1, 0, None])
ax.axvline(x=avg_depth,
color='green',
label="avg. coverage: %.2fx" % (avg_depth,))
ax.set_xlabel("coverage")
ax.set_ylabel("cumulative fraction of alignment")
def plot_nccd(ax, depth_dist_cumperc, avg_depth, max_depth, avg_depth_scaled,
lci, color):
"""Creates a bar plot of a normalized cumulative coverage distribution.
Includes legend stating average scaled and total depth.
Parameters
----------
ax : Axes
Axes instance of current plot.
depth_dist_cumperc : array_like
Scaled cumulative coverage depth distribution.
avg_depth : float
Mean coverage.
max_depth : int
Maximum coverage depth.
avg_depth_scaled : float
Scaled mean coverage.
lci : float
LCI parameter.
color : str
Color name to be used for the bar plot.
"""
ax.set_title("NCCD")
ax.bar(np.arange(depth_dist_cumperc.size) * 1.0 / max_depth,
depth_dist_cumperc,
align='edge',
width=1.0 / (depth_dist_cumperc.size - 1),
color=color)
ax.axis([0, 1, 0, 1])
ax.axvline(x=avg_depth_scaled,
color='green',
label="avg. coverage: %.2f [%.2fx]" % (avg_depth_scaled,
avg_depth))
ax.axvline(x=lci * avg_depth_scaled,
linestyle='dashed',
color='green',
label="%.2f x avg. cov.: %.2f [%.2fx]" % (
lci, lci * avg_depth_scaled, lci * avg_depth))
ax.set_xlabel("fraction of max coverage")
ax.set_ylabel("cumulative fraction of alignment")
ax.legend(loc=4) # lower right
def plot_lorenz(ax, x, y):
"""Creates a Lorenz curve plot of a coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
x : array_like
Coverage bins.
    y : array_like
Aligned bases per coverage bin.
"""
(gini, auc) = get_gini_auc(x, y)
ax.set_title("Lorenz")
# show equal distribution
ax.plot([0, 1],
[0, 1],
color='green',
label="equal distribution")
# show actual distribution
ax.plot(x, y, label="actual distribution")
ax.fill(x, y, color='gray', alpha=0.2)
ax.axis([0, 1, 0, 1])
ax.set_xlabel("fraction of coverage bins")
ax.set_ylabel("fraction of alignment")
ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
ax.legend(loc=2) # upper left
def plot_lorenz_b2b(ax, x, y):
"""Creates a Lorenz curve plot of a base2base coverage distribution.
Parameters
----------
ax : Axes
Axes instance of current plot.
x : array_like
        Reference base positions.
    y : array_like
Aligned bases per reference base.
"""
(gini, auc) = get_gini_auc(x, y)
ax.set_title("Lorenz base2base")
# show equal distribution
ax.plot([0, 1],
[0, 1],
color='green',
label="equal distribution")
# show actual distribution
ax.plot(x, y, label="actual distribution")
ax.fill(x, y, color='gray', alpha=0.2)
ax.axis([0, 1, 0, 1])
ax.set_xlabel("fraction of reference bases")
ax.set_ylabel("fraction of aligned read bases")
ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
ax.legend(loc=2) # upper left
# Integration methods
def integral_discrete(dist, limit):
"""Integrate discrete distribution with stepsize 1 by adding up values.
Parameters
----------
dist : array_like
Discrete distribution with stepsize 1.
limit : int
Upper limit (exclusive).
Returns
-------
float
Integral of the distribution between 0 and upper limit.
"""
return np.sum(dist[0:limit]) / 100.0
def integral_scaled(dist, limit):
"""Integrates scaled discrete distribution with arbitrary stepsize.
Parameters
----------
dist : array_like
Scaled discrete distribution with arbitrary stepsize.
limit : float
Upper limit (exclusive).
Returns
-------
float
Integral of the distribution between 0 and upper limit.
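    Example
    -------
    A worked sketch: for a hypothetical ``dist = [0.25, 1.0]`` (maximum
    depth 1) and ``limit = 0.5``, the scaled limit is ``0.5 / 1 = 0.5`` and
    only half of the first step contributes: ``0.5 * 0.25 = 0.125``.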
"""
area = 0
# prevent division by zero (for cases with 100% 1x coverage)
if dist.size == 1:
return 1.0 * limit
# scale limit by max depth
if limit <= dist.size - 1:
limit_scaled = limit * 1.0 / (dist.size - 1)
else:
limit_scaled = 1.0
stepsize = 1.0 / (dist.size - 1)
x = stepsize
for y in np.nditer(dist):
if limit_scaled > x:
area += stepsize * y
else:
area += (limit_scaled - (x - stepsize)) * y
break
x += stepsize
return area
def lorenzify(depth_dist):
"""Calculates Lorenz curve from coverage distribution.
Parameters
----------
depth_dist : array_like
Coverage depth distribution.
Returns
-------
x : array_like
X coordinates of Lorenz curve.
y : array_like
Y coordinates of Lorenz curve.
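    Example
    -------
    A hypothetical distribution ``[1, 1, 2]`` scales to ``[0.25, 0.25, 0.5]``
    and accumulates to ``y = [0, 0.25, 0.5, 1.0]`` over the bin fractions
    ``x = [0, 1/3, 2/3, 1]``.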
"""
reads_sum = np.sum(depth_dist)
dd_scaled = depth_dist * 1.0 / reads_sum
dd_sorted = np.sort(dd_scaled)
dd_cum = np.cumsum(dd_sorted)
# append (0|0) origin to beginning of arrays
y = np.append(np.array([0.0]), dd_cum)
x = np.append(np.array([0.0]),
(np.arange(depth_dist.size) + 1) * 1.0 / depth_dist.size)
return (x, y)
def lorenzify_b2b(depth_dist, ignore_uncovered=True):
"""Calculate Lorenz curve from base2base coverage distribution.
Parameters
----------
depth_dist : array_like
Coverage depth distribution.
ignore_uncovered : bool
Ignore bases with zero coverage, defaults to True.
Returns
-------
x : array_like
X coordinates of Lorenz curve.
y : array_like
Y coordinates of Lorenz curve.
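    Example
    -------
    For a hypothetical ``depth_dist = [2, 3, 1]`` (2 uncovered bases, 3 bases
    at 1x, 1 base at 2x) with ``ignore_uncovered=True`` the fractions are
    ``b = [0.75, 0.25]`` and ``v = [0.6, 0.4]``; sorted by ``v/b`` and
    accumulated this gives ``x = [0, 0.75, 1.0]`` and ``y = [0, 0.6, 1.0]``.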
"""
ali_total = 0
ref_total = 0
ali_bases_list = []
ref_bases_list = []
for (coverage, count) in enumerate(depth_dist):
if ignore_uncovered:
if coverage == 0:
continue
ali_bases = count * coverage
ali_bases_list.append(ali_bases)
ref_bases_list.append(count)
ali_total += ali_bases
ref_total += count
# turn lists into arrays and scale them
b = np.array(ref_bases_list) * 1.0 / ref_total
v = np.array(ali_bases_list) * 1.0 / ali_total
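    # sort the (reference fraction, alignment fraction) pairs by their
    # coverage ratio v/b, then accumulate both axes into the Lorenz curve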
triples = np.array([b, v, v/b])
ttriples = np.transpose(triples)
tttriples = np.transpose(ttriples[ttriples[:, 2].argsort()])
x = np.cumsum(tttriples[0])
y = np.cumsum(tttriples[1])
# append (0|0) origin to beginning of arrays, scale both axes on the fly
x = np.append(np.array([0]), x)
y = np.append(np.array([0]), y)
return (x, y)
def get_gini_auc(x, y):
"""Calculates Gini coefficient and area under Lorenz curve.
    The Gini coefficient (also known as the Gini index) is a measure of
    statistical dispersion. When applied to the distribution of aligned read
    bases per reference base, an even distribution of reads across the
    reference yields a low Gini coefficient (towards 0), while an alignment
    with all reads covering the same reference region yields a high Gini
    coefficient (towards 1).
Parameters
----------
x : array_like
X coordinates of a normalized Lorenz distribution.
y : array_like
Y coordinates of a normalized Lorenz distribution.
Returns
-------
float
Gini coefficient. Ranges from 0 for a perfectly even distribution to 1
for a maximally uneven distribution.
float
Integral of the Lorenz curve (area under curve). Maximal value is 0.5
if the given distribution is uniform.
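    Examples
    --------
    Illustrative doctests: the line of equality yields a Gini of 0; a curve
    where all alignment falls into the last bin yields a higher Gini.
    >>> get_gini_auc([0.0, 1.0], [0.0, 1.0])
    (0.0, 0.5)
    >>> get_gini_auc([0.0, 0.5, 1.0], [0.0, 0.0, 1.0])
    (0.5, 0.25)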
"""
# calculate area under curve
auc = 0.0
for (idx, value) in enumerate(x):
# no area at origin (0|0)
if idx == 0:
continue
square = (x[idx] - x[idx - 1]) * y[idx - 1]
triangle = ((x[idx] - x[idx - 1]) * (y[idx] - y[idx - 1])) / 2.0
auc = auc + square + triangle
# calculate Gini coefficient (area under normalized uniform dist is 0.5)
gini = 1.0 - (2 * auc)
return (gini, auc)
def covered_length_from_cigar(cigar):
"""Calculates length of reference covered by read from CIGAR operations.
Note
----
* not counting padding, skipping or insertions into the reference.
* not counting hard or soft clipped bases of the read
Parameters
----------
cigar : str
Unmodified CIGAR string from SAM file.
Returns
-------
int
        Length of the reference segment covered by the read.
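    Examples
    --------
    Illustrative doctests; insertions and clipped bases do not count:
    >>> covered_length_from_cigar("10M2I5M")
    15
    >>> covered_length_from_cigar("5S10M3D2M")
    15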
"""
    regex = re.compile(r"(\d+)([MIDNSHP=X])")
length = 0
for match in regex.finditer(cigar):
if match.group(2) in ("M", "=", "X", "D"):
length += int(match.group(1))
return length
def calc_avg_depth(depth_dist, ignore_uncovered=True):
"""Calculate average depth from a coverage distribution.
Optionally ignores uncovered bases (first array element).
Parameters
----------
depth_dist : array_like
Sorted coverage depth distribution.
ignore_uncovered : bool, optional
Ignore bases with zero coverage (first array element), defaults to
True.
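    Returns
    -------
    float
        Mean coverage depth.
    Examples
    --------
    Illustrative doctests for a hypothetical distribution (5 uncovered bases,
    10 bases at 1x, 10 bases at 2x):
    >>> calc_avg_depth([5, 10, 10])
    1.5
    >>> calc_avg_depth([5, 10, 10], ignore_uncovered=False)
    1.2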
"""
sum_bases = 0
sum_cover = 0
for (coverage, count) in enumerate(depth_dist):
if ignore_uncovered:
if coverage == 0:
continue
sum_bases += count
sum_cover += coverage * count
return sum_cover / (sum_bases * 1.0)
def main():
"""Executable to filter SAM files for references with uneven coverage.
    See ``--help`` for details on expected arguments. Takes input from
    STDIN or from a file given as an optional or positional argument. Logs
    messages to STDERR and writes the processed SAM file to STDOUT.
"""
# parse arguments
parser = argparse.ArgumentParser(description=DESC)
parser.add_argument('-i', '--input',
type=check_sam,
help="specify SAM file to be analysed (default: "
"STDIN)",
required=False)
parser.add_argument('--discard',
type=int,
help="keep or discard entries passing the filter "
"criteria?",
required=False,
default=0)
parser.add_argument('-v', '--verbose',
required=False,
action='store_true',
help='print additional information to stderr')
parser.add_argument('-d', '--debug',
required=False,
action='store_true',
help='print debug messages to stderr')
parser.add_argument('-m', '--modify',
required=False,
action='store_true',
help='modify header by adding PG record')
parser.add_argument('-r', '--read_lengths',
required=False,
action='store_true',
help='analyze read length distributions')
parser.add_argument('-c', '--covered',
required=False,
action='store_true',
help='covered bases only, ignore uncovered bases')
parser.add_argument('-p', '--plot',
required=False,
action='store_true',
help='plot distribution(s)')
    parser.add_argument('-l', '--lci_arg',
                        required=False,
                        type=check_pos_float_max1,
                        help='fraction of mean coverage to use for LCI '
                             'integration',
                        default=0.5)
parser.add_argument('-s', '--stats_file',
required=False,
help="override filename for tab-separated statistics")
acc_thresholds = parser.add_mutually_exclusive_group(required=False)
acc_thresholds.add_argument('--min_cov',
type=check_pos_int,
help='minimum average coverage',
default=0)
acc_thresholds.add_argument('--max_cov',
type=check_pos_int,
help='maximum average coverage',
default=sys.maxsize)
acc_thresholds.add_argument('--min_lci',
type=check_pos_float,
help='minimum LCI',
default=0.0)
acc_thresholds.add_argument('--max_lci',
type=check_pos_float,
help='maximum LCI',
default=1000.0)
acc_thresholds.add_argument('--min_gini',
type=check_pos_float_max1,
help='minimum Gini coefficient',
default=0.0)
acc_thresholds.add_argument('--max_gini',
type=check_pos_float_max1,
help='maximum Gini coefficient',
default=1.0)
(args, remain_args) = parser.parse_known_args()
# configure logging
if args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
else:
log.basicConfig(format="%(levelname)s: %(message)s")
# if args.debug:
# from pprint import pprint
# set filename for statistics output
if args.stats_file:
stats_filename = args.stats_file
elif args.input:
stats_filename = args.input + ".stats.csv"
else:
# this assumes that filename was explicitly set by calling batch script
stats_filename = "${filename}.stats.csv"
# initialize dicts
header = [] # SAM header lines, unmodified
depths = {} # dict of np.array of ints
ref_lengths = {} # dict of ints
read_counts = {} # dict of ints
if args.read_lengths:
read_lengths = {} # dict of dict of ints
read_lengths_total = {} # dict of ints
line_count = 0
positions = {} # dict of list for references and their line numbers
buffer = None # file handle used to buffer complete STDIN
byte_offset = 0 # offset of first line after end of header
line_offset = 0
# parse SAM file and divide reads by reference accession
log.info("START of filtering reads by conservation.")
# parse SAM file
log.info("STEP 1: parsing SAM records.")
try:
"""
open SAM file from either command line argument or STDIN
use temporary file as buffer if reading from STDIN
"""
if args.input:
handle = open(args.input, 'r')
else:
handle = fileinput.input(remain_args)
buffer = tempfile.TemporaryFile(mode='w+')
for line_nr, line in enumerate(handle, 0):
line_count = line_nr + 1
if buffer is not None:
buffer.write(line)
# assuming that headers always precede alignments (SAMv1)
if line.startswith("@"):
if args.modify:
line_offset = line_nr + 1
# store header lines for later modification
header.append(line)
# increase offset until reaching end of header
byte_offset += len(line)
else:
fields = line.split("\t")
"""
# content description
===========================
0 QNAME
1 FLAG
2 RNAME
3 POS
4 MAPQ
5 CIGAR
6 RNEXT
7 PNEXT
8 TLEN
9 SEQ
10 QUAL
...optional fields...
11 AS alignment score
12 NM edit distance to reference
13 ZL reference length
14 ZR "raw score"
15 ZE "expected"
                16 ZI percent identity (incl. indels and deaminated bases)
17 MD mismatching positions
18 NS PMD score
"""
# extract accession from required RNAME field
# (as written by MALT 0.0.3)
rname_parts = fields[2].split("|")
"""
# content
===============
0 gi
1 GenBank ID
2 ref
3 accession
4 tax
5 taxonomy ID
"""
accession = ""
try:
accession = rname_parts[3]
except IndexError:
accession = "unknown"
                    log.warning("no accession in line %s, assigning read to "
                                "accession 'unknown'", line_nr + 1)
# extract reference sequence length from optional ZL field
ref_length = 0
try:
zl_parts = fields[13].split(":")
ref_length = int(zl_parts[2])
                except IndexError:
                    log.error("no ZL field in line %s", line_nr + 1)
                    sys.exit(1)
"""
read ------------------------------
ref -----------------------------------------------------
cover 00000000000011111111111111111111111111111100000000000
pre | covered | post
start stop
"""
                # convert 1-based start position (POS field) to 0-based
                pre = int(fields[3]) - 1
# obtain stop position from required CIGAR field
covered = covered_length_from_cigar(fields[5])
stop = pre + covered
if accession in depths:
read_counts[accession] += 1
for position in range(pre, stop + 1):
if position in depths[accession]:
depths[accession][position] += 1
else:
depths[accession][position] = 1
                else:
                    ref_lengths[accession] = ref_length
                    read_counts[accession] = 1
                    # create the dict once, then record every covered position
                    # (reassigning it inside the loop keeps only the last one)
                    depths[accession] = {}
                    for position in range(pre, stop + 1):
                        depths[accession][position] = 1
# optional: read length distribution
if args.read_lengths:
read_length = len(fields[9])
if read_length in read_lengths_total:
read_lengths_total[read_length] += 1
else:
read_lengths_total[read_length] = 1
if accession in read_lengths:
if read_length in read_lengths[accession]:
read_lengths[accession][read_length] += 1
else:
read_lengths[accession][read_length] = 1
else:
read_lengths[accession] = {read_length: 1}
                # remember occurrence of accession and line number(s)
try:
positions[accession].append(line_nr)
except KeyError:
positions[accession] = [line_nr, ]
finally:
handle.close()
buffer.seek(0)
log.info("%i reads are aligned to %i references", line_count, len(depths))
# get overall read length statistics
if args.read_lengths:
max_length_total = max(read_lengths_total.keys())
min_length_total = min(read_lengths_total.keys())
        length_dist_total = np.zeros(
            (max_length_total - min_length_total + 1, ), dtype=int)
for (length, count) in read_lengths_total.items():
length_dist_total[length - min_length_total] = count
log.info("read lengths range from %i to %i bp",
min_length_total, max_length_total)
# iterate over accessions and calculate statistics for filtering
log.info("STEP 2: determining accessions matching the filter criteria.")
lines = []
max_depths = []
max_lengths = []
with open(stats_filename, "w") as handle:
        # create statistics header; do not reuse the name 'header', which
        # still holds the SAM header lines needed for the @PG modification
        stats_header = "accession\treference length [bp]\treads"
        if args.read_lengths:
            stats_header += ("\tmin length [bp]\tmax length [bp]"
                             "\tmean length [bp]")
        stats_header += "\tmin coverage\tmax coverage\tmean coverage"
        if args.covered:
            stats_header += "\tmin coverage*\tmax coverage*\tmean coverage*"
        stats_header += "\tLCI(%.2f)\tGini" % (args.lci_arg)
        if args.covered:
            stats_header += "\tLCI(%.2f)*\tGini*" % (args.lci_arg)
        print(stats_header, file=handle)
for accession in depths:
ref_length = ref_lengths[accession]
read_count = read_counts[accession]
# calculate coverage statistics
max_depth = max(depths[accession].values())
max_depths.append(max_depth)
# determine depth distribution
uncovered_bases = ref_length
            depth_dist = np.zeros((max_depth + 1,), dtype=int)
for (pos, depth) in depths[accession].items():
depth_dist[depth] += 1
uncovered_bases -= 1
depth_dist[0] = uncovered_bases
if args.debug:
print(depth_dist)
min_depth = 0
if uncovered_bases == 0:
min_depth = min(depths[accession].values())
avg_depth = calc_avg_depth(depth_dist, False)
avg_depth_scaled = avg_depth * 1.0 / max_depth
# convert counts to percentage of total reads (normalize y)
depth_dist_cumsum = np.cumsum(depth_dist)
depth_dist_cumperc = depth_dist_cumsum * 1.0 / (ref_length)
# integrate distribution as lci(0.5)
limit = args.lci_arg * avg_depth
# lci = integral_discrete(depth_dist_cumperc, limit)
lci = integral_scaled(depth_dist_cumperc, limit)
# # calculate Gini coefficient from Lorenz curve
# (x,y) = lorenzify(depth_dist)
# (gini, auc) = get_gini_auc(x, y)
# alternative Lorenz curve using base to base assignments
(u, v) = lorenzify_b2b(depth_dist, False)
if args.debug:
print("x (%i entries): %s" % (len(u), u))
print("y (%i entries): %s" % (len(v), v))
(gini_b2b, auc_b2b) = get_gini_auc(u, v)
# TODO also calculate Theil index, see
# https://en.wikipedia.org/wiki/Theil_Index
# optional: coverage stats for covered bases only
if args.covered:
min_depth_covered = min(depths[accession].values())
if uncovered_bases == 0:
min_depth_covered = min_depth
avg_depth_covered = calc_avg_depth(depth_dist, True)
# avg_depth_covered_scaled = avg_depth_covered * 1.0 / max_depth
depth_dist_covered_cumsum = np.cumsum(depth_dist[1:])
depth_dist_covered_cumperc = (depth_dist_covered_cumsum * 1.0
/ (ref_length - uncovered_bases))
limit_covered = args.lci_arg * avg_depth_covered
lci_covered = integral_scaled(depth_dist_covered_cumperc,
limit_covered)
(m, n) = lorenzify_b2b(depth_dist, True)
if args.debug:
print("x (%i entries): %s" % (len(m), m))
print("y (%i entries): %s" % (len(n), n))
(gini_b2b_covered, auc_b2b_covered) = get_gini_auc(m, n)
# optional: read length stats
if args.read_lengths:
max_length = max(read_lengths[accession].keys())
min_length = min(read_lengths[accession].keys())
                # weight each length by its count to get the mean read length
                avg_length = (sum(length * count for (length, count)
                                  in read_lengths[accession].items())
                              * 1.0 / read_count)
max_lengths.append(max_length)
# determine length distribution
                length_dist = np.zeros((max_length + 1,), dtype=int)
for (length, count) in read_lengths[accession].items():
length_dist[length] = count
# compile accession statistics
stats = "%s\t%i\t%i" % (accession, ref_length, read_count)
if args.read_lengths:
stats += "\t%i\t%i\t%f" % (min_length, max_length, avg_length)
stats += "\t%i\t%i\t%f" % (min_depth, max_depth, avg_depth)
if args.covered:
stats += "\t%i\t%i\t%f" % (min_depth_covered, max_depth,
avg_depth_covered)
stats += "\t%f\t%f" % (lci, gini_b2b)
if args.covered:
stats += "\t%f\t%f" % (lci_covered, gini_b2b_covered)
print(stats, file=handle)
# apply filters before time-consuming plotting steps
if ((avg_depth < args.min_cov)
or (avg_depth > args.max_cov)
or (lci < args.min_lci)
or (lci > args.max_lci)
or (gini_b2b < args.min_gini)
or (gini_b2b > args.max_gini)):
continue
color = 'blue'
if args.plot:
fig_cover = plt.figure(num=1,
figsize=(24, 12), # inch x inch
dpi=80, # 1 inch = 80 pixels
facecolor='white',
edgecolor='black')
# x = np.arange(depth_dist_cumperc.size) * 1.0 / max_depth
# CD = coverage distribution plot
ax_cd = fig_cover.add_subplot(2, 4, 1)
plot_cd(ax_cd, depth_dist, avg_depth)
# SCD = scaled coverage distribution plot
ax_scd = fig_cover.add_subplot(2, 4, 2)
plot_scd(ax_scd, depth_dist, avg_depth, ref_length, max_depth)
                # SCD for covered bases only
ax_scd_covered = fig_cover.add_subplot(2, 4, 3)
plot_scd(ax_scd_covered,
np.append(np.array([0]), depth_dist[1:]),
avg_depth_covered,
ref_length - uncovered_bases,
max_depth)
# # CCD = cumulative coverage distribution plot
# ax_ccd = fig_cover.add_subplot(2, 4, 2)
# plot_ccd(ax_ccd, depth_dist_cumsum, avg_depth)
# # SCCD = scaled cumulative coverage distribution plot
# ax_sccd = fig_cover.add_subplot(2, 4, 3)
# plot_sccd(ax_sccd, depth_dist_cumperc, avg_depth)
# NCCD = normalized cumulative coverage distribution
ax_nccd = fig_cover.add_subplot(2, 4, 4)
plot_nccd(ax_nccd, depth_dist_cumperc, avg_depth, max_depth,
avg_depth_scaled, args.lci_arg, color)
ax_nccd.text(0.05, 0.95,
"lci(%.2f) = %.2f" % (args.lci_arg, lci),
transform=ax_nccd.transAxes)
# # Lorenz curve
# ax_lorenz = fig_cover.add_subplot(2, 4, 5)
# plot_lorenz(ax_lorenz, x, y)
# Lorenz curve base2base
ax_b2b = fig_cover.add_subplot(2, 4, 6)
plot_lorenz_b2b(ax_b2b, u, v)
if args.covered:
ax_b2b_covered = fig_cover.add_subplot(2, 4, 7)
plot_lorenz_b2b(ax_b2b_covered, m, n)
# save plot to png
fig_cover.tight_layout()
fig_cover.savefig(accession + "_coverage.png")
plt.close(fig_cover)
if args.read_lengths:
fig_readlength = plt.figure(num=2,
figsize=(6, 6), # inch x inch
dpi=80, # 1 inch = 80px
facecolor='white',
edgecolor='black')
# RLD = read length distribution
ax_rld = fig_readlength.add_subplot(1, 1, 1)
                # use the total read count (line_nr + 1 would include headers)
                plot_rld(ax_rld, length_dist, min_length_total,
                         max_length_total, sum(read_counts.values()),
                         length_dist_total)
# save plot to png
fig_readlength.savefig(accession + "_readlengths.png")
plt.close(fig_readlength)
log.info("accession %s: reference has a length of %i bp and %i "
"aligned reads", accession, ref_length, read_count)
log.info("accession %s: coverage ranges from %ix to %ix, average "
"is %.3fx", accession, min_depth, max_depth, avg_depth)
if args.covered:
log.info("accession %s: coverage for covered bases only "
"ranges from %ix to %ix, average is %.3fx",
accession, min_depth_covered, max_depth,
avg_depth_covered)
if args.read_lengths:
log.info("accession %s: read length ranges from %i bp to %i "
"bp, average is %.3f bp",
accession, min_length, max_length, avg_length)
# finally print accession for later filtering steps
lines.extend(positions[accession])
if args.debug:
log.info("filtering accession: %s", accession)
# re-open file if not a buffered stream
if buffer is None:
buffer = open(args.input, 'r')
# skip original headers and sneak in modified headers instead
if args.modify:
log.info("Writing new @PG record to SAM header.")
cline = " ".join(sys.argv[1:])
modified_header = modify_header(
header,
name=splitext(basename(__file__))[0],
version=__version__,
command_line=cline,
description=TEXT
)
extra_headers = len(modified_header) - len(header) # should be 1
for record in modified_header:
print(record.rstrip(), file=sys.stdout)
# fast forward buffer to beginning of alignments
buffer.seek(byte_offset)
if args.debug:
log.info("%i new line added to header lines.", extra_headers)
# keep or remove accessions from SAM file
alt = ["discarding", "keeping"]
log.info("STEP 3: %s %i unevenly covered references from SAM file.",
alt[args.discard], len(lines))
line_filter(lines, buffer, discard=(args.discard == 0),
offset=line_offset)
buffer.close()
log.info("END of filtering references by coverage, statistics saved to "
"file %s", stats_filename)
    sys.exit(0)
if __name__ == "__main__":
main()