# Source code for samsifter.tools.filter_ref_pmds

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Filter references with high attribution of ancient reads in a MALT'ed and
PMD'ed SAM file

.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""

import sys

# Guard: this module requires Python 3. The message is a plain print so the
# guard itself still parses under Python 2. Use sys.exit() instead of the
# site-injected exit() builtin, which is not guaranteed in non-interactive runs.
if sys.version_info[0] < 3:
    print("Error, I need python 3.x or newer")
    sys.exit(1)

import argparse
import logging as log
import fileinput
import tempfile
from os.path import basename, splitext

# custom libraries
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import (
    FilterParameter, FilterSwitch, FilterThreshold
)
from samsifter.util.arg_sanitation import (
    check_pos_int, check_sam, check_percent
)
from samsifter.util.filters import line_filter
from samsifter.util.papertrail import modify_header
from samsifter.version import VERSION

__version__ = VERSION

# global variables
TEXT = "Filter references by PMD score"
DESC = ("Filter taxa with high attribution of ancient reads in a MALT'ed and "
        "PMD'ed SAM file")


def item():
    """Create item representing this tool in list and tree views.

    Returns
    -------
    FilterItem
        Item for use in item-based list and tree views.
    """
    tool_item = FilterItem(text=TEXT, desc=DESC)
    tool_item.set_command(splitext(basename(__file__))[0])

    # minimum post-mortem degradation score threshold
    tool_item.add_parameter(FilterThreshold(
        text="min. PMDS",
        desc="minimum post-mortem degradation score (PMDS)",
        cli_name="--pmds",
        default=3,
        minimum=-10,
        maximum=10,
        precision=0,
        required=True,
        active=True,
    ))

    # minimum share of reads that must exceed the PMDS threshold
    tool_item.add_parameter(FilterThreshold(
        text="min. reads",
        desc="minimum percentage of reads exceeding PMD threshold [50.0]",
        cli_name="--reads_perc",
        default=50.0,
        unit="%",
        required=True,
        active=True,
    ))

    # whether matching entries are kept or discarded
    tool_item.add_parameter(FilterSwitch(
        text="filter direction",
        desc="Keep or discard entries passing the filter criteria?",
        cli_name="--discard",
        default=0,
        options=["discard", "keep"],
    ))

    tool_item.add_parameter(FilterParameter(
        text="verbose",
        desc="print additional information to STDERR",
        cli_name="--verbose",
        default=True,
        active=True,
    ))

    tool_item.add_parameter(FilterParameter(
        text="debug",
        desc="print debug messages to STDERR",
        cli_name="--debug",
        default=False,
    ))

    # provenance: annotate output SAM header with a @PG record
    tool_item.add_parameter(FilterParameter(
        text="write PG tag",
        desc="annotate the filtered SAM file by adding a PG tag to the header",
        cli_name="--modify",
        default=True,
        active=True,
    ))

    return tool_item
def main():
    """Executable to filter SAM files for references with ancient reads.

    See ``--help`` for details on expected arguments. Takes input from
    either STDIN, or optional, or positional arguments. Logs messages to
    STDERR and writes processed SAM files to STDOUT.
    """
    # parse arguments
    parser = argparse.ArgumentParser(description=DESC)
    parser.add_argument('-i', '--input', type=check_sam,
                        help="specify SAM file to be analysed (default: "
                        "STDIN)", required=False)
    parser.add_argument('--discard', type=int,
                        help="keep or discard entries passing the filter "
                        "criteria?", required=False, default=0)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print additional information to stderr')
    parser.add_argument('-d', '--debug', required=False, action='store_true',
                        help='print debug messages to stderr')
    parser.add_argument('-m', '--modify', required=False, action='store_true',
                        help='modify header by adding PG record')
    parser.add_argument('-p', '--pmds', required=True, type=float,
                        help='post-mortem degradation score (PMDS) threshold',
                        default=3)
    read_thresholds = parser.add_mutually_exclusive_group(required=True)
    read_thresholds.add_argument('-rp', '--reads_perc', type=check_percent,
                                 help='attributed read threshold (percent)',
                                 default=50.0)
    read_thresholds.add_argument('-rt', '--reads_total', type=check_pos_int,
                                 help='attributed read threshold (total)')
    (args, remain_args) = parser.parse_known_args()

    # configure logging
    if args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # initialize dicts and variables
    header = []               # SAM header lines, unmodified
    total_read_counts = {}    # reads per accession
    ancient_read_counts = {}  # reads per accession with PMDS >= threshold
    positions = {}            # dict of lists: reference -> line numbers
    buffer = None             # file handle used to buffer complete STDIN
    handle = None             # input handle; pre-bound so `finally` is safe
    byte_offset = 0           # offset of first line after end of header
    line_offset = 0

    log.info("START of filtering references by PMDS.")

    # parse SAM file
    log.info("STEP 1: parsing SAM records.")
    try:
        # open SAM file from either command line argument or STDIN,
        # use temporary file as buffer if reading from STDIN
        if args.input:
            handle = open(args.input, 'r')
        else:
            handle = fileinput.input(remain_args)
            buffer = tempfile.TemporaryFile(mode='w+')

        for line_nr, line in enumerate(handle, 0):
            if buffer is not None:
                buffer.write(line)

            if line.startswith("@"):
                if args.modify:
                    line_offset = line_nr + 1
                    # store header lines for later modification
                    header.append(line)
                    # increase offset until reaching end of header
                    byte_offset += len(line)
            else:
                # mandatory SAM fields:
                # 0 QNAME, 1 FLAG, 2 RNAME, 3 POS, 4 MAPQ, 5 CIGAR,
                # 6 RNEXT, 7 PNEXT, 8 TLEN, 9 SEQ, 10 QUAL,
                # ...optional fields..., 18 NS
                fields = line.split("\t")

                # extract accession from required RNAME field (written by
                # MALT); expected format:
                # gi|<GenBank ID>|ref|<accession>|tax|<taxonomy ID>
                rname_parts = fields[2].split("|")
                try:
                    accession = rname_parts[3]
                except IndexError:
                    accession = "unknown"
                    # NOTE: log.warn is a deprecated alias of log.warning
                    log.warning("no accession in line %s, assigning read to "
                                "accession 'unknown'", line_nr + 1)

                # extract PMD score from optional NS field
                # (written by pmdtools)
                pmds = 0
                try:
                    ns_parts = fields[18].split(":")
                    pmds = float(ns_parts[2])
                except ValueError:
                    log.error("invalid PMD score in line %i: %s",
                              line_nr + 1, ns_parts[2])
                    sys.exit(1)
                except IndexError:
                    log.error(
                        "no PMD score in line %i, run 'pmdtools --dry "
                        "--header --writesamfield' first", line_nr + 1)
                    sys.exit(1)

                # count and filter reads by PMDS
                if accession in total_read_counts:
                    total_read_counts[accession] += 1
                else:
                    total_read_counts[accession] = 1

                if pmds >= args.pmds:
                    if accession in ancient_read_counts:
                        ancient_read_counts[accession] += 1
                    else:
                        ancient_read_counts[accession] = 1

                # remember occurence of accession and line number(s)
                try:
                    positions[accession].append(line_nr)
                except KeyError:
                    positions[accession] = [line_nr, ]
    finally:
        # guard against `handle` being unbound if open() itself failed,
        # which would otherwise mask the original exception with a NameError
        if handle is not None:
            handle.close()

    # rewind STDIN buffer; it is None when reading from a named file, in
    # which case an unconditional seek would raise AttributeError
    if buffer is not None:
        buffer.seek(0)

    # filter references
    log.info("STEP 2: determining references matching the filter criteria.")
    lines = []
    for (accession, ancient_read_count) in sorted(ancient_read_counts.items()):
        if args.reads_total:
            if ancient_read_count >= args.reads_total:
                lines.extend(positions[accession])
                if args.debug:
                    log.info("filtering accession: %s", accession)
        elif args.reads_perc:
            perc = ancient_read_count / total_read_counts[accession] * 100.0
            if perc >= args.reads_perc:
                lines.extend(positions[accession])
                if args.debug:
                    log.info("filtering accession: %s", accession)

    # re-open file if not a buffered stream
    if buffer is None:
        buffer = open(args.input, 'r')

    # skip original headers and sneak in modified headers instead
    if args.modify:
        log.info("Writing new @PG record to SAM header.")
        cline = " ".join(sys.argv[1:])
        modified_header = modify_header(
            header,
            name=splitext(basename(__file__))[0],
            version=__version__,
            command_line=cline,
            description=TEXT
        )
        extra_headers = len(modified_header) - len(header)  # should be 1
        for record in modified_header:
            print(record.rstrip(), file=sys.stdout)
        # fast forward buffer to beginning of alignments
        buffer.seek(byte_offset)
        if args.debug:
            log.info("%i new line added to header lines.", extra_headers)

    # keep or remove accessions from SAM file
    alt = ["discarding", "keeping"]
    log.info("STEP 3: %s %i references from SAM file.",
             alt[args.discard], len(lines))
    line_filter(lines, buffer, discard=(args.discard == 0),
                offset=line_offset)
    buffer.close()
    log.info("END of filtering references by PMDS.")
    sys.exit()
if __name__ == "__main__":
    # run the filter when invoked as a script
    main()