Source code for samsifter.tools.count_taxon_reads

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Analysing filter step to count reads per taxon.

.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""

import sys
if not sys.version_info[0] >= 3:
    print("Error, I need python 3.x or newer")
    exit(1)

import argparse
import logging as log
import fileinput
from os.path import basename, splitext, isfile

# custom libraries
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import FilterParameter, FilterFilepath
from samsifter.util.arg_sanitation import check_sam

# global variables
TEXT = "Count reads per taxon"
DESC = ("Counts reads per taxon ID and stores them in a separate statistics "
        "file.")


[docs]def item(): """Create item representing this tool in list and tree views. Returns ------- FilterItem Item for use in item-based list and tree views. """ filter_item = FilterItem(text=TEXT, desc=DESC, icon=FilterItem.ICON_ANALYZER) filter_item.set_command(splitext(basename(__file__))[0]) filter_item.add_parameter(FilterParameter( text="verbose", desc="print additional information to STDERR", cli_name="--verbose", default=True, active=True )) filter_item.add_parameter(FilterParameter( text="debug", desc="print debug messages to STDERR", cli_name="--debug", default=False )) filter_item.add_parameter(FilterFilepath( text="prefix", desc="prefix of temporary statistics files", cli_name="--prefix", default='${filename}', # crucial for clean batch processing! active=True )) # input/output requirements filter_item.set_input_format('SAM') filter_item.set_input_sorting('any') filter_item.set_input_compression('uncompressed') filter_item.set_output_format('SAM') filter_item.set_output_sorting('as_input') filter_item.set_output_compression('uncompressed') return filter_item
[docs]def main(): """Analyzing filter step to count reads per taxon.""" # parse arguments parser = argparse.ArgumentParser(description=DESC) parser.add_argument('-i', '--input', type=check_sam, help="specify SAM file to be analysed (default: " "STDIN)", required=False) parser.add_argument('-p', '--prefix', required=False, default='reads_per_taxon', help='prefix of temporary statistics files') parser.add_argument('-v', '--verbose', required=False, action='store_true', help='print additional information to STDERR') parser.add_argument('-d', '--debug', required=False, action='store_true', help='print debug messages to STDERR') (args, remain_args) = parser.parse_known_args() # configure logging if args.verbose: log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG) else: log.basicConfig(format="%(levelname)s: %(message)s") log.info("START of counting reads per taxon.") # initialize dict counts = {} # parse SAM file log.info("STEP 1: parsing SAM records.") try: # open SAM file from either command line argument or STDIN if args.input: handle = open(args.input, 'r') else: handle = fileinput.input(remain_args) for line_nr, line in enumerate(handle, 1): if not line.startswith("@"): fields = line.split("\t") """ # content description =========================== 0 QNAME 1 FLAG 2 RNAME 3 POS 4 MAPQ 5 CIGAR 6 RNEXT 7 PNEXT 8 TLEN 9 SEQ 10 QUAL ...optional fields... 11 AS alignment score 12 NM edit distance to reference 13 ZL reference length 14 ZR "raw score" 15 ZE "expected" 16 ZI percent identify (incl. indels & deaminated) 17 MD mismatching positions 18 NS PMD score """ # extract taxon ID from required RNAME field # (as written by MALT 0.0.3) rname_parts = fields[2].split("|") """ # content =============== 0 gi 1 GenBank ID 2 ref 3 accession 4 tax 5 taxonomy ID """ try: taxon_id = rname_parts[5] except IndexError: taxon_id = 'unknown' log.warn("no taxon ID in line %s, assigning read to " "taxon 'unknown'", line_nr) # add to counts try: counts[taxon_id] += 1 except KeyError: counts[taxon_id] = 1 print(line.rstrip(), file=sys.stdout) finally: handle.close() # check if previous statistic files exist and choose filename accordingly suffix = 'csv' # assuming that 999 filter steps will suffice for idx in range(1, 1000): stats_filename = '%s.%03d.%s' % (args.prefix, idx, suffix) if not isfile(stats_filename): break log.info("STEP 2: writing statistics to %s.", stats_filename) with open(stats_filename, mode='w') as handle: print("'taxon_id','read_count'", file=handle) for key, value in sorted(counts.items()): print("'%s',%i" % (key, value), file=handle) log.info("END of counting reads per taxon.") exit()
if __name__ == "__main__": main()