#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Filter references with high attribution of ancient reads in a MALT'ed and
PMD'ed SAM file
.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
if sys.version_info[0] < 3:
    print("Error: this script requires Python 3.x or newer", file=sys.stderr)
    exit(1)
import argparse
import logging as log
import fileinput
import tempfile
from os.path import basename, splitext
# custom libraries
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import (
FilterParameter, FilterSwitch, FilterThreshold
)
from samsifter.util.arg_sanitation import (
check_pos_int, check_sam, check_percent
)
from samsifter.util.filters import line_filter
from samsifter.util.papertrail import modify_header
from samsifter.version import VERSION
__version__ = VERSION
# global variables
TEXT = "Filter references by PMD score"
DESC = ("Filter taxa with high attribution of ancient reads in a MALT'ed and "
"PMD'ed SAM file")
def item():
"""Create item representing this tool in list and tree views.
Returns
-------
FilterItem
Item for use in item-based list and tree views.
"""
filter_item = FilterItem(text=TEXT, desc=DESC)
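    # the filter's command name is this module's file name without its extension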
filter_item.set_command(splitext(basename(__file__))[0])
filter_item.add_parameter(FilterThreshold(
text="min. PMDS",
desc="minimum post-mortem degradation score (PMDS)",
cli_name="--pmds",
default=3,
minimum=-10,
maximum=10,
precision=0,
required=True,
active=True
))
filter_item.add_parameter(FilterThreshold(
text="min. reads",
desc="minimum percentage of reads exceeding PMD threshold [50.0]",
cli_name="--reads_perc",
default=50.0,
unit="%",
required=True,
active=True
))
filter_item.add_parameter(FilterSwitch(
text="filter direction",
desc="Keep or discard entries passing the filter criteria?",
cli_name="--discard",
default=0,
options=["discard", "keep"]
))
filter_item.add_parameter(FilterParameter(
text="verbose",
desc="print additional information to STDERR",
cli_name="--verbose",
default=True,
active=True
))
filter_item.add_parameter(FilterParameter(
text="debug",
desc="print debug messages to STDERR",
cli_name="--debug",
default=False
))
filter_item.add_parameter(FilterParameter(
text="write PG tag",
desc="annotate the filtered SAM file by adding a PG tag to the header",
cli_name="--modify",
default=True,
active=True
))
return filter_item
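

# Note: the cli_name values above match the argument names parsed in main()
# below; the SamSifter GUI presumably uses them to assemble the actual command
# line for this filter.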
def main():
"""Executable to filter SAM files for references with ancient reads.
See ``--help`` for details on expected arguments. Takes input from
either STDIN, or optional, or positional arguments. Logs messages to
STDERR and writes processed SAM files to STDOUT.
"""
# parse arguments
parser = argparse.ArgumentParser(description=DESC)
parser.add_argument('-i', '--input',
type=check_sam,
help="specify SAM file to be analysed (default: "
"STDIN)",
required=False)
    parser.add_argument('--discard',
                        type=int,
                        help="keep (1) or discard (0) entries passing the "
                        "filter criteria",
                        required=False,
                        default=0)
parser.add_argument('-v', '--verbose',
action='store_true',
help='print additional information to stderr')
parser.add_argument('-d', '--debug',
required=False,
action='store_true',
help='print debug messages to stderr')
parser.add_argument('-m', '--modify',
required=False,
action='store_true',
help='modify header by adding PG record')
parser.add_argument('-p', '--pmds',
required=True,
type=float,
help='post-mortem degradation score (PMDS) threshold',
default=3)
read_thresholds = parser.add_mutually_exclusive_group(required=True)
read_thresholds.add_argument('-rp', '--reads_perc',
type=check_percent,
help='attributed read threshold (percent)',
default=50.0)
read_thresholds.add_argument('-rt', '--reads_total',
type=check_pos_int,
help='attributed read threshold (total)')
(args, remain_args) = parser.parse_known_args()
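    # parse_known_args() keeps unrecognised (positional) arguments in
    # remain_args so they can be passed to fileinput as input file names below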
# configure logging
if args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
else:
log.basicConfig(format="%(levelname)s: %(message)s")
# initialize dicts and variables
header = [] # SAM header lines, unmodified
total_read_counts = {}
ancient_read_counts = {}
positions = {} # dict of list for references and their line numbers
buffer = None # file handle used to buffer complete STDIN
byte_offset = 0 # offset of first line after end of header
line_offset = 0
log.info("START of filtering references by PMDS.")
# parse SAM file
log.info("STEP 1: parsing SAM records.")
try:
"""
open SAM file from either command line argument or STDIN
use temporary file as buffer if reading from STDIN
"""
if args.input:
handle = open(args.input, 'r')
else:
handle = fileinput.input(remain_args)
buffer = tempfile.TemporaryFile(mode='w+')
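            # buffering the complete STDIN stream in a temporary file allows
            # re-reading the alignment lines in step 3 after counting them here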
for line_nr, line in enumerate(handle, 0):
if buffer is not None:
buffer.write(line)
if line.startswith("@"):
if args.modify:
line_offset = line_nr + 1
# store header lines for later modification
header.append(line)
# increase offset until reaching end of header
byte_offset += len(line)
else:
fields = line.split("\t")
"""
# content
====================
0 QNAME
1 FLAG
2 RNAME
3 POS
4 MAPQ
5 CIGAR
6 RNEXT
7 PNEXT
8 TLEN
9 SEQ
10 QUAL
...optional fields...
18 NS
"""
                # extract accession from required RNAME field (written by MALT)
rname_parts = fields[2].split("|")
"""
# content
===============
0 gi
1 GenBank ID
2 ref
3 accession
4 tax
5 taxonomy ID
"""
accession = ""
try:
accession = rname_parts[3]
except IndexError:
accession = "unknown"
log.warn("no accession in line %s, assigning read to "
"accession 'unknown'", line_nr + 1)
# extract PMD score from optional NS field
# (written by pmdtools)
pmds = 0
try:
ns_parts = fields[18].split(":")
pmds = float(ns_parts[2])
except ValueError:
log.error("invalid PMD score in line %i: %s",
line_nr + 1, ns_parts[2])
exit(1)
except IndexError:
log.error(
"no PMD score in line %i, run 'pmdtools --dry "
"--header --writesamfield' first", line_nr + 1)
exit(1)
# count and filter reads by PMDS
if accession in total_read_counts:
total_read_counts[accession] += 1
else:
total_read_counts[accession] = 1
if pmds >= args.pmds:
if accession in ancient_read_counts:
ancient_read_counts[accession] += 1
else:
ancient_read_counts[accession] = 1
                # remember occurrence of accession and line number(s)
try:
positions[accession].append(line_nr)
except KeyError:
positions[accession] = [line_nr, ]
    finally:
        handle.close()
        if buffer is not None:
            buffer.seek(0)
# filter references
log.info("STEP 2: determining references matching the filter criteria.")
lines = []
for (accession, ancient_read_count) in sorted(ancient_read_counts.items()):
if args.reads_total:
if ancient_read_count >= args.reads_total:
lines.extend(positions[accession])
if args.debug:
log.info("filtering accession: %s", accession)
elif args.reads_perc:
perc = ancient_read_count / total_read_counts[accession] * 100.0
if perc >= args.reads_perc:
lines.extend(positions[accession])
if args.debug:
log.info("filtering accession: %s", accession)
# re-open file if not a buffered stream
if buffer is None:
buffer = open(args.input, 'r')
# skip original headers and sneak in modified headers instead
if args.modify:
log.info("Writing new @PG record to SAM header.")
cline = " ".join(sys.argv[1:])
modified_header = modify_header(
header,
name=splitext(basename(__file__))[0],
version=__version__,
command_line=cline,
description=TEXT
)
extra_headers = len(modified_header) - len(header) # should be 1
for record in modified_header:
print(record.rstrip(), file=sys.stdout)
# fast forward buffer to beginning of alignments
buffer.seek(byte_offset)
if args.debug:
log.info("%i new line added to header lines.", extra_headers)
# keep or remove accessions from SAM file
alt = ["discarding", "keeping"]
log.info("STEP 3: %s %i references from SAM file.",
alt[args.discard], len(lines))
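    # args.discard == 0 corresponds to the "discard" option; in that case
    # line_filter() presumably removes the collected line numbers from the
    # output, otherwise it keeps only those lines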
line_filter(lines, buffer, discard=(args.discard == 0),
offset=line_offset)
buffer.close()
log.info("END of filtering references by PMDS.")
exit()
if __name__ == "__main__":
main()