Source code for samsifter.stats.summarize_stats

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Compilation of statistics for multiple files into a summary spreadsheet.

Usually executed after batch processing of multiple SAM files with the same
workflow. The script takes the final read count from each statistics file and
adds it to a new spreadsheet, using the input filename as the column header.
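
For illustration, the expected shapes look roughly like this (column names
other than ``taxon_id`` are hypothetical; the script simply takes the last
column of each file)::

    'taxon_id','read_count'
    '123',120.0

The compiled summary then carries one column per input file::

    '','a.sifted.csv','b.sifted.csv'
    '123',120.0,80.0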

All Bash scripts exported from the SamSifter GUI that process multiple files
(in sequential or parallel processing mode) execute this script by default and
remove all temporary statistics files when done.
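
A manual run follows the same pattern (an invocation sketch; the exact paths
and options used by the exported scripts may differ)::

    cd /path/to/batch/results
    python3 summarize_stats.py > summary.csv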

Note
----
See the script ``enrich_summary.py`` for details on how to enrich this summary
with data from external databases like IMG/M.


.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
if sys.version_info[0] < 3:
    print("Error: this script requires Python 3 or newer.")
    sys.exit(1)

import argparse
import logging as log
from os.path import isfile
from os import listdir
import re
import pandas as pd
import numpy as np


def main():
    """Executes the summarization of multiple statistics files.

    See ``--help`` for details on expected arguments.
    """
    # parse arguments
    parser = argparse.ArgumentParser(description="compile statistics from "
                                     "temporary files")
    parser.add_argument('-v', '--verbose',
                        required=False,
                        action='store_true',
                        help='print additional information to STDERR')
    parser.add_argument('-d', '--debug',
                        required=False,
                        action='store_true',
                        help='print debug messages to STDERR')
    parser.add_argument('-p', '--prefix',
                        required=False,
                        default='reads_per_taxon',
                        help='prefix of temporary statistics files')
    (args, remain_args) = parser.parse_known_args()

    # configure logging; --debug prints debug messages, --verbose prints
    # additional informational messages
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # create list of stats files in working dir; matching keys on the
    # '.sifted.csv' suffix (the --prefix option is currently not applied
    # to the match)
    files = []
    for entry in listdir():
        if isfile(entry):
            match = re.match(r'.*\.sifted\.csv$', entry)
            if match:
                files.append(match.group(0))
    log.info("Found %i statistics files.", len(files))

    if len(files) > 0:
        summary = pd.DataFrame(dtype=np.float64)
        for filename in sorted(files):
            log.info("Gathering data from file %s", filename)
            # read file (quoting=2 corresponds to csv.QUOTE_NONNUMERIC)
            df = pd.read_csv(filename,
                             sep=',',
                             # index_col=0,      # better set index explicitly
                             # engine='python',  # C is faster, supports dtype
                             engine='c',
                             dtype={'taxon_id': str},
                             quotechar="'",
                             quoting=2)
            # pylint: disable=E1101
            df = df.set_index('taxon_id', drop=False)  # dynamic member
            # pylint: enable=E1101

            # identify last column and label it with the input filename
            redux = df.iloc[:, -1:]
            redux.columns = [filename]

            # filter taxa without reads
            filtered = redux[redux[filename] > 0]

            # merge it with summary (outer join on the taxon_id index)
            summary = pd.concat([summary, filtered], axis=1)

        # save summary to CSV on STDOUT
        summary.to_csv(sys.stdout,
                       sep=',',
                       header=True,
                       # na_rep=0.0,
                       quotechar="'",
                       quoting=2)
    sys.exit()
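
# The summary grows one column per input file via ``pd.concat(..., axis=1)``,
# which performs an outer join on the taxon_id index. A minimal sketch with
# invented filenames and counts (illustration only, kept as comments so the
# module stays importable):
#
#     a = pd.DataFrame({'a.sifted.csv': {'123': 120.0}})
#     b = pd.DataFrame({'b.sifted.csv': {'123': 80.0, '456': 5.0}})
#     pd.concat([a, b], axis=1)
#     #      a.sifted.csv  b.sifted.csv
#     # 123         120.0          80.0
#     # 456           NaN           5.0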

if __name__ == "__main__":
    main()