Source code for samsifter.stats.compile_stats

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Compilation of statistics for a SAM file.

Usually executed after processing a SAM file with a workflow containing steps
that produce temporary statistics files, e.g. the
:py:mod:`samsifter.tools.count_taxon_reads` module. The script takes all
temporary statistics files in the working directory, sorts them by filename
and appends their values as new columns to a statistics spreadsheet named
after the original input file.
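
For example, a workflow with three counting steps might leave files like
these in the working directory (names are illustrative; the three-digit
step index depends on the workflow)::

    reads_per_taxon.001.csv
    reads_per_taxon.004.csv
    reads_per_taxon.007.csv

Each file maps a ``taxon_id`` to a ``read_count``; the compiled spreadsheet
gains one ``read_count_<step>`` column per file, here ``read_count_1``,
``read_count_4`` and ``read_count_7``.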

Both the SamSifter GUI and the Bash scripts exported from it execute this
script by default and remove all temporary statistics files when done.

Note
----
See the script ``enrich_summary`` for details on how to enrich the resulting
file with data from external databases like IMG/M.

See the script ``summarize_stats`` for details on how to summarize multiple
statistics files to quickly inspect the results of a batch run.
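
Example
-------
A minimal sketch of a typical invocation, assuming the module is executed
directly (an installed console script may be named differently) and that
STDOUT is redirected to a spreadsheet named after the input file::

    python3 -m samsifter.stats.compile_stats --remove > input.sam.csv

This compiles all ``reads_per_taxon.*.csv`` files in the working directory
into one spreadsheet and deletes the temporary files afterwards.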


.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
if sys.version_info[0] < 3:
    sys.exit("Error: this script requires Python 3.x or newer")

import argparse
import logging as log
from os.path import isfile
from os import listdir, remove
import re
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)


def main():
    """Executes the compilation of temporary statistics files.

    See ``--help`` for details on expected arguments.
    """
    # parse arguments
    parser = argparse.ArgumentParser(description="compile statistics from "
                                     "temporary files")
    parser.add_argument('-v', '--verbose', required=False,
                        action='store_true',
                        help='print additional information to STDERR')
    parser.add_argument('-d', '--debug', required=False, action='store_true',
                        help='print debug messages to STDERR')
    parser.add_argument('-r', '--remove', required=False, action='store_true',
                        help='remove temporary statistics files after use')
    parser.add_argument('-p', '--prefix', required=False,
                        default='reads_per_taxon',
                        help='prefix of temporary statistics files')
    (args, remain_args) = parser.parse_known_args()

    # configure logging (--debug takes precedence over --verbose)
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # create dict of stats files in working dir,
    # using a dict instead of a list as steps may be omitted
    steps = {}
    # escape the prefix in case it contains regex metacharacters
    pattern = r'%s\.(\d{3})\.csv$' % re.escape(args.prefix)
    for entry in listdir():
        if isfile(entry):
            match = re.match(pattern, entry)
            if match:
                index = int(match.group(1))
                steps[index] = entry
    log.info("Found %i temporary statistics files with prefix '%s'.",
             len(steps), args.prefix)

    if len(steps) > 0:
        readcounts = pd.DataFrame(dtype=np.float64)
        # process files sorted by consecutive step number
        for idx, filename in sorted(steps.items()):
            log.info("Gathering data from step %i (%s)", idx, filename)
            df = pd.read_csv(filename,
                             sep=',',
                             # index_col=0,      # better set index explicitly
                             # engine='python',  # C is faster, supports dtype
                             engine='c',
                             dtype={'taxon_id': str,
                                    'read_count': np.float64},
                             quotechar="'",
                             quoting=2)
            # pylint: disable=E1101
            df = df.set_index('taxon_id', drop=False)   # dynamic member...
            # pylint: enable=E1101
            # merge stats into sparse frame, assumes we never gain new taxa
            readcounts['read_count_%i' % idx] = df['read_count']

        # save compiled frame as CSV on STDOUT
        readcounts.to_csv(sys.stdout,
                          sep=',',
                          header=True,
                          na_rep=0.0,
                          quotechar="'",
                          quoting=2)

    # remove temporary stats files
    if args.remove:
        for filename in steps.values():
            remove(filename)

    sys.exit()
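

# A minimal sketch (not part of the original module) of the column-wise
# merge performed in main() above: assigning a Series to a DataFrame column
# aligns on the index, so taxa missing from a later step yield NaN (written
# out as 0.0 via ``na_rep``). The taxon IDs below are made up for
# illustration only.
#
#     >>> import pandas as pd
#     >>> step1 = pd.DataFrame({'read_count': [10.0, 5.0]},
#     ...                      index=['9606', '9598'])
#     >>> step2 = pd.DataFrame({'read_count': [7.0]}, index=['9606'])
#     >>> merged = pd.DataFrame()
#     >>> merged['read_count_1'] = step1['read_count']
#     >>> merged['read_count_2'] = step2['read_count']
#     >>> merged
#           read_count_1  read_count_2
#     9606          10.0           7.0
#     9598           5.0           NaN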

if __name__ == "__main__":
    main()