#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calculate and filter post-mortem degeneration score for NGS reads in SAM files.
Modified version of PMDtools based on v0.55 by Pontus Skoglund:
cite: P Skoglund, BH Northoff, MV Shunkov, AP Derevianko, S Paabo, J Krause, M
Jakobsson (2014) Separating endogenous ancient DNA from modern day
contamination in a Siberian Neandertal, PNAS, advance online 27 January
Included changes:
* count and show excluded reads due to unsupported CIGAR operations
* count and show excluded reads due to % identity filtering
* port from Python2 to Python3
* bugfix: ambiguous combinations of CIGAR and MD with deletions lead to shifts
in reconstructed reference sequence
* support for all CIGAR operations except padded references (untested)
* support of IUPAC ambiguity codes
* optional parameters to adapt % identity calculation to MALT defaults
* refactoring into main method
* use of ArgumentParser over deprecated OptionParser
* improved PEP8 and pylint compliance
* ReST documentation
.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
import argparse
import math
import subprocess
from itertools import repeat
# custom libraries
from samsifter.util.sam import reconstruct
from samsifter.util.genetics import reverse_complement, gc, aln_identity
def phred2prob(quality):
    """Convert a PHRED score to the probability of a sequencing error.

    Parameters
    ----------
    quality : int
        PHRED score.

    Returns
    -------
    float
        Probability of a sequencing error.
    """
    exponent = -quality / 10.0
    return math.pow(10.0, exponent)
def prob2phred(probability):
    """Convert the probability of a sequencing error to a PHRED score.

    Parameters
    ----------
    probability : float
        Probability of a sequencing error.

    Returns
    -------
    float
        PHRED score.
    """
    log10_probability = math.log(probability, 10)
    return -10.0 * log10_probability
def prob2ascii(probability, offset=33):
    """Convert an error probability to an ASCII-encoded PHRED score.

    By default uses ASCII offset 33 (Illumina standard).

    Parameters
    ----------
    probability : float
        Probability of a sequencing error.
    offset : int, optional
        ASCII encoding offset, defaults to 33.

    Returns
    -------
    str
        Single ASCII character encoding the (truncated) PHRED score.
    """
    phred = int(prob2phred(probability))
    return chr(phred + offset)
def prob(pos, model, qual, poly, match=True):
    """Calculate probability of either match or mismatch under given model.

    Combines the pre-computed damage probability for this position, the
    sequencing error probability derived from the PHRED score, and the
    probability of a true polymorphism.

    Parameters
    ----------
    pos : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        ASCII-encoded PHRED score of base (offset 33).
    poly : float
        Probability of a true polymorphism at this base.
    match : bool, optional
        Switch between calculation for a match (default) or mismatch.

    Returns
    -------
    float
        Probability of either match or mismatch under given model.
    """
    p_damage = float(model[pos])
    # divide by 3: an error can produce any of the three other bases
    p_error = phred2prob(ord(qual) - 33) / 3.0
    p_poly = poly
    no_damage = 1.0 - p_damage
    no_error = 1.0 - p_error
    no_poly = 1.0 - p_poly
    # a match is observed if none of the three events altered the base, or
    # if two of them cancelled out
    p_match = (no_damage * no_error * no_poly
               + p_damage * p_error * no_poly
               + p_error * p_poly * no_damage)
    return p_match if match else 1.0 - p_match
def likelihood_match(position, model, qual, poly):
    """Calculate likelihood of a match under given model.

    Parameters
    ----------
    position : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        ASCII-encoded PHRED score of base (offset 33).
    poly : float
        Probability of a true polymorphism at this base.

    Returns
    -------
    float
        Probability of match under given model.
    """
    return prob(position, model, qual, poly, match=True)
def likelihood_mismatch(position, model, qual, poly):
    """Calculate likelihood of a mismatch under given model.

    Parameters
    ----------
    position : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        ASCII-encoded PHRED score of base (offset 33).
    poly : float
        Probability of a true polymorphism at this base.

    Returns
    -------
    float
        Probability of mismatch under given model.
    """
    return prob(position, model, qual, poly, match=False)
def adjust_quality(position, model, qual):
    """Adjust base quality value (sequencing error probability).

    The adjusted value is the probability that the base is wrong due to
    either post-mortem damage or a sequencing error.

    Parameters
    ----------
    position : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        ASCII-encoded PHRED score of base (offset 33).

    Returns
    -------
    float
        Adjusted base error probability.
    """
    p_damage = float(model[position])
    # divide by 3: an error can produce any of the three other bases
    p_error = phred2prob(ord(qual) - 33) / 3.0
    # complement of "neither damaged nor miscalled"
    return 1.0 - ((1.0 - p_damage) * (1.0 - p_error))
def geometric(pval, kval, constant):
    """Calculate geometrically distributed probability.

    Parameters
    ----------
    pval : float
        Parameter p of geometric distribution.
    kval : float
        Parameter k of geometric distribution.
    constant : float
        Constant value added to the geometric distribution.

    Returns
    -------
    float
        Probability of deamination based on geometric distribution.
    """
    decay = (1.0 - pval) ** (kval - 1)
    return decay * pval + constant
def fa_get(ffile, chrom, fstart, fend, samtoolspath, verbose=False):
    """Get a reference subsequence from an indexed FASTA file.

    Calls ``samtools faidx`` in a subprocess and strips the FASTA header
    from its output.

    Parameters
    ----------
    ffile : str
        Path to FASTA file.
    chrom : int or str
        Chromosome name or number.
    fstart : int
        Start position.
    fend : int
        End position.
    samtoolspath : str
        Path to samtools.
    verbose : bool, optional
        Enable output of executed commandline, defaults to False.

    Returns
    -------
    str
        Reference sequence, empty if the region could not be fetched.
    """
    location = '%s:%s-%s' % (chrom, fstart, fend)
    cmd_line = [samtoolspath, 'faidx', ffile, location]
    # universal_newlines=True decodes stdout to str; the original read raw
    # bytes and crashed when joining them with a str separator (Python 3)
    proc = subprocess.Popen(cmd_line, stdout=subprocess.PIPE,
                            stderr=subprocess.DEVNULL,
                            universal_newlines=True)
    output, _ = proc.communicate()
    # first whitespace-separated token is the FASTA header; the remaining
    # tokens are the wrapped sequence lines
    pileupline = ''.join(output.split()[1:])
    if len(pileupline) < 1:
        # report to stderr; stdout is reserved for SAM output in this tool
        print('no such reference sequence', cmd_line, file=sys.stderr)
    if verbose:
        print(' '.join(cmd_line))
    return pileupline
def score_pmd(read, ref, quals, ancient_model_deam, modern_model_deam,
              adjustment_model_deam=None,
              polymorphism_contamination=0.001, polymorphism_ancient=0.001,
              adjustbaseq_all=False,
              adjustbaseq=False,
              baseq=0,
              deamination=False,
              cpg=False,
              nocpg=False,
              udg_half=False,
              pmds_prim=False):
    """Calculate post-mortem degradation score (PMDS).

    Requires full read and reference sequence including clips, skips and gaps
    as recovered from CIGAR and MD to use the correct distance from 5' or 3'
    end in the PMDS calculation. However only the aligned read bases with
    PHRED scores are considered for PMD scoring.

    Optional adjustment of base qualities requires additional adjustment
    model.

    Parameters
    ----------
    read : str
        Full read sequence.
    ref : str
        Aligned reference sequence including clips, skips and gaps.
    quals : str
        Quality scores for aligned bases (ASCII, offset 33).
    ancient_model_deam : list
        Probability distribution for ancient deamination.
    modern_model_deam : list
        Probability distribution for modern deamination.
    adjustment_model_deam : list, optional
        Adjustments to the deamination probabilities, defaults to None.
    polymorphism_contamination : float, optional
        Probability of polymorphism due to contamination, defaults to 0.001.
    polymorphism_ancient : float, optional
        Probability of polymorphism due to age, defaults to 0.001.
    adjustbaseq_all : bool, optional
        Switch to adjust all base quality values, defaults to False.
    adjustbaseq : bool, optional
        Switch to adjust affected base quality values, defaults to False.
    baseq : int, optional
        Minimum quality value for processed bases, defaults to 0.
    deamination : bool, optional
        Enable output of base frequencies in deaminated context, defaults to
        False. Currently unused (kept for interface compatibility).
    cpg : bool, optional
        Only use Cs and Gs in CpG context, defaults to False.
    nocpg : bool, optional
        Do NOT use Cs and Gs in CpG context, defaults to False.
    udg_half : bool, optional
        Only use Cs and Gs in CpG context, the first and last base are used
        regardless of dinucleotide context. Defaults to False.
    pmds_prim : bool, optional
        Defaults to False. Currently unused (kept for interface
        compatibility).

    Returns
    -------
    float
        Likelihood of observations under the ancient DNA (damage) model.
    float
        Likelihood of observations under the modern DNA (mutation) model.
    float
        Post-mortem degradation score (log-likelihood ratio).
    str
        Adjusted base qualities.
    """
    # this should never happen if reference was correctly reconstructed
    if len(read) != len(ref):
        raise ValueError("Undefined for sequences of unequal length")

    l_degrad = 1.0  # likelihood under the ancient (damage) model
    l_mut = 1.0     # likelihood under the modern (mutation) model
    newquals = quals

    # IUPAC ambiguity codes treated as uninformative
    # TODO some of these may actually be informative!
    ambiguous = ('N', 'X', 'B', 'D', 'H', 'V', 'Y', 'R', 'W', 'S')

    qual_idx = 0  # index into the quality string (counts read bases only)
    for readbase, refbase in zip(read, ref):
        # skipped intron: consumes neither read base nor quality
        if readbase == '-' and refbase == 'X':
            continue
        # hard clipping: bases not present in read or quality string
        elif readbase == refbase == 'X':
            continue
        # soft clipping: read base and quality present but unaligned
        elif refbase == 'X':
            qual_idx += 1
            continue
        # gap in read (deletion): no read base, no quality
        elif readbase == '-':
            continue
        # gap in reference (insertion): read base still has a quality
        elif refbase == '-':
            qual_idx += 1
            continue
        # ambiguous base in either sequence: skip but consume quality
        elif readbase in ambiguous or refbase in ambiguous:
            qual_idx += 1
            continue

        #######################################################################
        # At this point we're dealing with a qualified alignment column with  #
        # exactly one informative read base, one informative reference base   #
        # and one quality score.                                              #
        #######################################################################
        quality = quals[qual_idx]
        i = qual_idx                   # distance from 5' end of read
        z = len(quals) - 1 - qual_idx  # distance from 3' end of read
        # BUGFIX: consume this column's quality now, before any 'continue'
        # below; previously qual_idx was never advanced for aligned bases, so
        # every aligned column reused position 0 and its quality score
        qual_idx += 1

        if adjustbaseq_all and adjustment_model_deam is not None:
            # blanket adjustment regardless of observed bases; BUGFIX: splice
            # into newquals (not quals) so earlier adjustments are kept
            newprob = (adjustment_model_deam[i]
                       + adjustment_model_deam[z]
                       + phred2prob(ord(quality) - 33))
            newqual = prob2ascii(newprob)
            newquals = "%s%s%s" % (newquals[0:i], newqual, newquals[(i + 1):])

        if (ord(quality) - 33) < baseq:
            # make sure that quality is adjusted even if baseq is below
            # threshold, then skip the base for scoring
            if adjustbaseq:
                if refbase == 'C' and readbase == 'T':
                    # possible 5' deamination: distance from 5' end
                    newprob = adjust_quality(i, ancient_model_deam, quality)
                    newqual = prob2ascii(newprob)
                    newquals = "%s%s%s" % (newquals[0:i], newqual,
                                           newquals[(i + 1):])
                elif refbase == 'G' and readbase == 'A':
                    # possible 3' deamination: distance from 3' end
                    newprob = adjust_quality(z, ancient_model_deam, quality)
                    newqual = prob2ascii(newprob)
                    newquals = "%s%s%s" % (newquals[0:i], newqual,
                                           newquals[(i + 1):])
            continue

        # compute degradation score
        if refbase == 'C':
            # C>T mismatches indicate 5' deamination
            if readbase == 'T':
                l_degrad *= likelihood_mismatch(
                    i, ancient_model_deam, quality, polymorphism_ancient)
                l_mut *= likelihood_mismatch(
                    i, modern_model_deam, quality, polymorphism_contamination)
                if adjustbaseq:
                    newprob = adjust_quality(i, ancient_model_deam, quality)
                    newqual = prob2ascii(newprob)
                    newquals = "%s%s%s" % (newquals[0:i], newqual,
                                           newquals[(i + 1):])
            elif readbase == 'C':
                l_degrad *= likelihood_match(
                    i, ancient_model_deam, quality, polymorphism_ancient)
                l_mut *= likelihood_match(
                    i, modern_model_deam, quality, polymorphism_contamination)
        elif refbase == 'G':
            # CpG-context filters
            # NOTE(review): ref is indexed with the quality index i rather
            # than the alignment column, which is only equivalent without
            # indels/clips upstream of this base -- confirm against callers
            if cpg:
                if ref[i - 1] != 'C':
                    continue
            elif nocpg:
                if ref[i - 1] == 'C':
                    continue
            elif udg_half:
                if ref[i - 1] != 'C' and z != 0:
                    continue
            # G>A mismatches indicate 3' deamination
            if readbase == 'A':
                l_degrad *= likelihood_mismatch(
                    z, ancient_model_deam, quality, polymorphism_ancient)
                l_mut *= likelihood_mismatch(
                    z, modern_model_deam, quality, polymorphism_contamination)
                if adjustbaseq:
                    newprob = adjust_quality(z, ancient_model_deam, quality)
                    newqual = prob2ascii(newprob)
                    newquals = "%s%s%s" % (newquals[0:i], newqual,
                                           newquals[(i + 1):])
            elif readbase == 'G':
                l_degrad *= likelihood_match(
                    z, ancient_model_deam, quality, polymorphism_ancient)
                l_mut *= likelihood_match(
                    z, modern_model_deam, quality, polymorphism_contamination)

    # calculate log-likelihood ratio as final PMD score
    pmds = math.log(l_degrad / l_mut)
    return (l_degrad, l_mut, pmds, newquals)
[docs]def main():
"""Executable to calculate post-mortem degradation scores for SAM files.
See ``--help`` for details on expected arguments. Takes input only on
STDIN. Logs messages to STDERR and writes processed SAM file to
STDOUT.
"""
parser = argparse.ArgumentParser(
usage=("python %(prog)s <SAM formatted data with MD field present "
"from stdin> [options]")
)
# options to control PMD score calculation
pmd_options = parser.add_argument_group(
title="PMD parameters",
description=("influence the geometric distribution used for PMD score "
"calculation by changing these parameters")
)
pmd_options.add_argument("--PMDpparam",
action="store",
type=float,
dest="PMDpparam",
help="parameter p in geometric probability \
distribution of PMD",
default=0.3)
pmd_options.add_argument("--PMDconstant",
action="store",
type=float,
dest="PMDconstant",
help="constant C in geometric probability \
distribution of PMD",
default=0.01)
pmd_options.add_argument("--polymorphism_ancient",
action="store",
type=float,
dest="polymorphism_ancient",
help="True biological polymorphism between the \
ancient individual and the reference sequence",
default=0.001)
pmd_options.add_argument("--polymorphism_contamination",
action="store",
type=float,
dest="polymorphism_contamination",
help="True biological polymorphism between the \
contaminants and the reference sequence",
default=0.001)
# options to filter SAM file
filter_options = parser.add_argument_group(
title="filters",
description=("restrict analysis to a subset of reads by setting these "
"thresholds")
)
filter_options.add_argument("--dry",
action="store_true",
dest="dry",
help="print SAM input without any filters",
default=False)
filter_options.add_argument("-n", "--number",
action="store",
type=int,
dest="maxreads",
help="stop after these many reads have been \
processed",
default=(10 ** 20))
filter_options.add_argument("-c", "--chromosome",
action="store",
type=str,
dest="chromosome",
help="only process data from this chromosome",
default=False)
filter_options.add_argument("--perc_identity",
type=float,
dest="perc_identity",
help="only output sequences with percent \
identity above this threshold",
default=0.0)
filter_options.add_argument("-m", "--requiremapq",
action="store",
type=int,
dest="mapq",
help="only process sequences with mapping \
quality at least this great",
default=0)
filter_options.add_argument("--readlength",
action="store",
type=int,
dest="readlength",
help="only process sequences with this read \
length",
default=0)
filter_options.add_argument("--maxlength",
action="store",
type=int,
dest="maxlength",
help="only process sequences with max this \
read length",
default=300)
filter_options.add_argument("--minlength",
action="store",
type=int,
dest="minlength",
help="only process sequences with min this \
read length",
default=0)
filter_options.add_argument("--maxGC",
action="store",
type=float,
dest="maxGC",
help="only process sequences with max this \
GC content of the aligning reference sequence",
default=1.0)
filter_options.add_argument("--minGC",
action="store",
type=float,
dest="minGC",
help="only process sequences with min this GC \
content of the aligning reference sequence",
default=0.0)
filter_options.add_argument("-q", "--requirebaseq",
action="store",
type=int,
dest="baseq",
help="only process bases with base quality at \
least this great",
default=0)
filter_options.add_argument("-t", "--threshold",
type=float,
dest="threshold",
help="only output sequences with PMD score \
above this threshold",
default=(-20000.0))
filter_options.add_argument("--upperthreshold",
type=float,
dest="upperthreshold",
help="only output sequences with PMD score \
below this threshold",
default=(1000000.0))
# options controlling handling of CpG contexts
cpg_options = parser.add_argument_group(
title="CpG context",
description=("options controlling the handling of deamination in CpG "
"contexts")
)
# --CpG, --noCpG and --UDGhalf exclude each other
cpg_excl = cpg_options.add_mutually_exclusive_group(required=False)
cpg_excl.add_argument("--CpG", "--UDGplus",
action="store_true",
dest="cpg",
help="only use Cs and Gs in CpG context",
default=False)
cpg_excl.add_argument("--noCpG",
action="store_true",
dest="nocpg",
help="dont use Cs and Gs in CpG context",
default=False)
cpg_excl.add_argument("--UDGhalf",
action="store_true",
dest="udg_half",
help=("only use Cs and Gs in CpG context, the first "
"and last base are used regardless of "
"dinucleotide context"),
default=False)
# cpg_options.add_argument("--UDGplus",
# action="store_true",
# dest="UDGplus",
# help="only use Cs and Gs in CpG context \
# (synonymous with option --CpG)",
# default=False)
cpg_options.add_argument("--UDGminus",
action="store_true",
dest="UDGminus",
help="use all bases (placeholder)",
default=False)
# options controlling output
output_options = parser.add_argument_group(
title="output",
description="control type and detail of output"
)
output_options.add_argument("--verbose",
action="store_true",
dest="verbose",
help="verbose",
default=False)
output_options.add_argument("--header",
action="store_true",
dest="header",
help="output the SAM header",
default=False)
output_options.add_argument("--writesamfield",
action="store_true",
dest="writesamfield",
help="add 'DS:Z:<PMDS>' field to SAM output, \
will overwrite if already present",
default=False)
output_options.add_argument("--stats",
action="store_true",
dest="stats",
help="output summarizing statistics to stderr",
default=False)
output_options.add_argument("--debug",
action="store_true",
dest="debug",
help="show additional information for \
debugging",
default=False)
output_options.add_argument("-p", "--printDS",
action="store_true",
dest="printDS",
help="print PMD scores",
default=False)
output_options.add_argument("--printalignments",
action="store_true",
dest="printalignments",
help="print human readable alignments",
default=False)
output_options.add_argument("--platypus",
action="store_true",
dest="platypus",
help="output big list of base frequencies for \
platypus",
default=False)
output_options.add_argument("-d", "--deamination",
action="store_true",
dest="deamination",
help="output base frequencies in the read at \
positions where there are C or G in the \
reference",
default=False)
# options to modify calculation of percent identity
perc_id_options = parser.add_argument_group(
title="percent identity",
description="options to modify calculation of percent identity"
)
perc_id_options.add_argument("--include_deamination",
action="store_true",
dest="include_deamination",
help="treat possibly deaminated T>C and A>G \
pairs as mismatches in calculation of \
percent identity",
default=False)
perc_id_options.add_argument("--include_indels",
action="store_true",
dest="include_indels",
help="treat insertions and deletions as \
mismatches in calculation of percent \
identity",
default=False)
perc_id_options.add_argument("--include_unknown",
action="store_true",
dest="include_unknown",
help="treat Ns in either read or reference \
as mismatch in calculation of percent \
identity",
default=False)
# remaining options
parser.add_argument('--version',
action='version',
version='%(prog)s v0.55 (modified)')
parser.add_argument("--PMDSprim",
action="store_true",
dest="pmds_prim",
help="PMDSprim",
default=False)
parser.add_argument("--PMDSprimthreshold",
action="store",
type=float,
dest="PMDSprimthreshold",
help="PMDSprimthreshold",
default=False)
parser.add_argument("--first",
action="store_true",
dest="first",
help="outputs the deamination rate at the first \
position only, but with a standard error",
default=False)
parser.add_argument("--range",
action="store",
type=int,
dest="range",
help="output deamination patterns for this many \
positions from the sequence terminus (default=30)",
default=30)
parser.add_argument("--noclips",
action="store_true",
dest="noclips",
help="no clips",
default=False)
parser.add_argument("--noindels",
action="store_true",
dest="noindels",
help="no indels",
default=False)
parser.add_argument("--onlyclips",
action="store_true",
dest="onlyclips",
help="only clips",
default=False)
parser.add_argument("--onlydeletions",
action="store_true",
dest="onlydeletions",
help="only deletions",
default=False)
parser.add_argument("--onlyinsertions",
action="store_true",
dest="onlyinsertions",
help="only insertions",
default=False)
parser.add_argument("--nodeletions",
action="store_true",
dest="nodeletions",
help="no deletions",
default=False)
parser.add_argument("--noinsertions",
action="store_true",
dest="noinsertions",
help="no insertions",
default=False)
parser.add_argument("--notreverse",
action="store_true",
dest="notreverse",
help="no reverse complement alignments",
default=False)
parser.add_argument("-a", "--adjustbaseq",
action="store_true",
dest="adjustbaseq",
help="apply PMD-aware adjustment of base quality \
scores specific to C>T and G>A mismatches to the \
reference",
default=False)
parser.add_argument("--adjustbaseq_all",
action="store_true",
dest="adjustbaseq_all",
help="apply PMD-aware adjustment of base quality \
scores regardless of observed bases",
default=False)
parser.add_argument("--samtoolspath",
action="store",
dest="samtoolspath",
help="full path to samtools",
default='samtools')
parser.add_argument("--basecomposition",
action="store_true",
dest="basecomposition",
help="basecomposition",
default=False)
parser.add_argument("-r", "--refseq",
action="store",
dest="refseq",
help="refseq",
default=False)
parser.add_argument("--estimate",
action="store_true",
dest="estimate",
help="two-terminus estimate of contamination",
default=False)
parser.add_argument("--estimatebase",
action="store",
type=int,
dest="estimatebase",
help="position of base used fortwo-terminus estimate \
of contamination",
default=0)
parser.add_argument("-b", "--basic",
action="store",
type=int,
dest="basic",
help="only output reads with a C>T mismatch this \
many basepairs from the 5' end",
default=0)
(options, args) = parser.parse_known_args()
# if options.UDGplus:
# options.CpG = True
# maxlen = options.maxlength
# pre-compute probabilities of deamination for 1000 bases under modern and
# ancient models
modern_model_deam = [i for i in repeat(0.001, 1000)]
ancient_model_deam = [geometric(options.PMDpparam, i, options.PMDconstant)
for i in range(1, 1000)]
adjustment_model_deam = None
if options.adjustbaseq_all:
# constant is 0.0 here in contrast to model used to compute PMD scores
adjustment_model_deam = [geometric(options.PMDpparam, i, 0.0)
for i in range(1, 1000)]
# base composition
# start_count = 0
# rev_start_count = 0
# not_counted = 0
# imperfect = 0
# start_dict= {}
# start_dict_rev = {}
mismatch_dict = {}
mismatch_dict_rev = {}
# mismatch_dict_CpG = {}
# mismatch_dict_CpG_rev = {}
first_c = 0
first_t = 0
clipexcluded = 0
indelexcluded = 0
cigarextexcluded = 0 # reads excluded for extended CIGAR (N, H, P, not S)
identityexcluded = 0 # reads excluded due to percent identity filter
md_missing = 0
no_gc_excluded = 0
excluded_threshold = 0
passed = 0
noquals = 0
# maskings = 0
#
# CCandCC = 0
# CTandCC = 0
# CCandCT = 0
# CTandCT = 0
#
# estimator_list = []
composition_dict = {}
composition_dict_rev = {}
line_counter = 0
for line in sys.stdin:
if '@' in line[0]:
if options.header:
print(line.rstrip('\n'))
continue
line_counter += 1
line = line.rstrip('\n')
col = line.split('\t')
if options.debug:
print(col)
# readname = col[0]
position = int(col[3])
chromosome = col[2]
if options.chromosome:
if chromosome != options.chromosome:
continue
mapq = int(col[4])
read = col[9]
readlen = len(read)
quals = col[10]
flag = col[1]
position = int(col[3])
cigar = col[5]
if len(quals) < 2:
noquals += 1
continue
if options.noinsertions:
if 'I' in cigar:
continue
if options.nodeletions:
if 'D' in cigar:
continue
if options.onlyinsertions:
if 'I' not in cigar:
continue
if options.onlydeletions:
if 'D' not in cigar:
continue
if options.noindels:
if 'I' in cigar or 'D' in cigar:
indelexcluded += 1
continue
if options.noclips:
if 'S' in cigar or 'H' in cigar or 'N' in cigar or 'P' in cigar:
clipexcluded += 1
continue
if options.onlyclips:
if 'S' not in cigar:
continue
if 'H' in cigar or 'P' in cigar or 'N' in cigar:
print("cigar found: %s" % (cigar,),
"PMDtools only supports cigar operations M, I, S and D,",
"the alignment has been excluded",
file=sys.stderr)
cigarextexcluded += 1
continue
if mapq < options.mapq:
continue
# read length filter
# TODO check for default values unnecessary, just remove defaults and
# check for args directly
if options.readlength > 0:
if options.readlength != len(read):
continue
if options.minlength > 0:
if options.minlength > len(read):
continue
if options.maxlength != 300:
if options.maxlength < len(read):
continue
# chromosome filter
if options.chromosome:
if chromosome != options.chromosome:
continue
# check orientation of read
if flag.isdigit():
if int(flag) & 16:
reverse = True
else:
reverse = False
else:
if 'r' in flag:
reverse = True
else:
reverse = False
# filter reverse reads
if options.notreverse:
if reverse:
continue
# check for previously calculated PMD score
ds_field = False
if 'DS:Z:' in line:
ds_field = True
pmds = float(line.split('DS:Z:')[1].rstrip('\n').split()[0])
l_ratio = pmds
# Recreate reference sequence from MD field
if (ds_field is False or options.writesamfield or options.basic > 0
or options.perc_identity > 0.01 or options.printalignments
or options.adjustbaseq or options.adjustbaseq_all
or options.deamination or options.dry or options.estimate
or options.first):
read = col[9]
try:
md_field = line.split('MD:Z:')[1].split()[0].rstrip('\n')
except IndexError:
md_missing += 1
continue
# using external library to reconstruct reference from CIGAR and MD
rec = reconstruct(read, cigar, md_field)
(rec_ref_full, rec_read_full, rec_ref_aln, rec_read_aln) = rec
if reverse:
read = reverse_complement(read)
# don't use raw read beyond here...
rec_ref_full = reverse_complement(rec_ref_full)
rec_read_full = reverse_complement(rec_read_full)
rec_ref_aln = reverse_complement(rec_ref_aln)
rec_read_aln = reverse_complement(rec_read_aln)
quals = quals[::-1]
# this assumption previously lead to trouble because of missing
# gaps in aligned read!
real_read = read
real_ref_seq = rec_ref_aln
# debugging the reference sequence reconstruction
if options.debug:
print("%s\tread\n%s\tref" % (real_read, real_ref_seq))
print("CIGAR:\t%s" % (cigar,))
print("MD:\t%s" % (md_field,))
# GC content filter
if options.maxGC < 1.0 or options.minGC > 0.0:
gc_content = gc(real_ref_seq)
if gc_content > options.maxGC or gc_content < options.minGC:
continue
# test for empty reference sequence TODO include in reconstruction
# method and throw exception?
if ('G' not in real_ref_seq
and 'C' not in real_ref_seq
and 'T' not in real_ref_seq
and 'A' not in real_ref_seq):
print('bad reference sequence reconstruction: %s' % real_ref_seq,
file=sys.stderr)
print('SAM line: %s' % (line,))
exit(1)
if options.basecomposition:
backoffset = 10
if reverse:
endpos = position
startpos = position + len(real_read)
else:
startpos = position
endpos = position + len(real_read)
# 5' end
largerefseq = fa_get(options.refseq,
chromosome,
startpos - backoffset,
startpos + options.range,
options.samtoolspath)
if len(largerefseq) < 1:
continue
# print largerefseq
if reverse:
largerefseq = reverse_complement(largerefseq)
# print(largerefseq, real_read)
for i in range(-backoffset, options.range):
# print(i+backoffset, len(largerefseq))
base = largerefseq[min([i + backoffset, len(largerefseq)])]
thekey = '5' + base + str(i)
if thekey in composition_dict.keys():
addition = composition_dict[thekey]
addition += 1
composition_dict[thekey] = addition
else:
composition_dict[thekey] = 1
# 3' end
largerefseq = fa_get(options.refseq,
chromosome,
endpos - options.range,
endpos + backoffset,
options.samtoolspath)
if len(largerefseq) < 1:
continue
if reverse:
largerefseq = reverse_complement(largerefseq)
for i in range(-backoffset, options.range):
base = largerefseq[min([i + backoffset, len(largerefseq)])]
thekey = '3' + base + str(i)
if thekey in composition_dict_rev.keys():
addition = composition_dict_rev[thekey]
addition += 1
composition_dict_rev[thekey] = addition
else:
composition_dict_rev[thekey] = 1
continue # useless?
# basic filter
# prints the SAM line if a C>T mismatch with sufficient base quality is
# observed in the first n bases, where n is specified
if options.basic > 0:
for read, ref, pos in zip(real_read, real_ref_seq,
range(0, len(real_ref_seq))):
if read == 'N':
break
elif ref == 'N':
break
elif read == '-':
break
elif ref == '-':
break
i = pos # - start_position
if i >= readlen: # 20
break
if i >= options.basic:
break
if options.cpg:
if (ref == 'C' and read == 'T'
and ord(quals[i] - 33) > options.baseq
and real_ref_seq[i + 1] == 'G'):
print(line.rstrip('\n'))
break
elif (ref == 'C' and read == 'T'
and ord(quals[i]) - 33 > options.baseq):
print(line.rstrip('\n'))
break
# first base
# prints the deamination rate at the first base and a standard error
# computed by jackknife over reads
# Tally C>T (deaminated, first_t) vs. C>C (intact, first_c) observations
# at read position 0, subject to the --baseq cutoff; in CpG mode the
# following reference base must be a 'G'.  Totals are reported after the
# read loop.  NOTE(review): real_ref_seq[1] raises IndexError for
# length-1 alignments in CpG mode -- confirm a minimum length upstream.
if options.first:
ref = real_ref_seq[0]
read = real_read[0]
if options.cpg:
if (ref == 'C' and read == 'T'
and (ord(quals[0])-33) > options.baseq
and real_ref_seq[1] == 'G'):
first_t += 1
if (ref == 'C' and read == 'C'
and (ord(quals[0])-33) > options.baseq
and real_ref_seq[1] == 'G'):
first_c += 1
else:
if (ref == 'C' and read == 'T'
and ord(quals[0]) - 33 > options.baseq):
first_t += 1
if (ref == 'C' and read == 'C'
and ord(quals[0]) - 33 > options.baseq):
first_c += 1
# Percent-identity ("divergence") filter: compute alignment identity of
# the reconstructed read/reference pair; reads below the --perc_identity
# cutoff are counted and skipped.  The mismatch_string is reused later by
# --printalignments, hence the second trigger condition.
if options.perc_identity > 0.01 or options.printalignments:
# divergence filter
(perc_identity, mismatch_string) = aln_identity(
rec_read_aln,
rec_ref_aln,
options.include_indels,
options.include_deamination,
options.include_unknown
)
if perc_identity < options.perc_identity:
identityexcluded += 1
continue
# start PMD score computations
# Compute the PMD score only when actually needed: no pre-existing DS tag,
# an explicit request to (re)write it, or any downstream option that
# consumes the score/adjusted qualities.
if (ds_field is False
or (ds_field is True and options.writesamfield is True)
or options.basic > 0
or options.adjustbaseq
or options.adjustbaseq_all
or options.deamination
or options.dry):
# l_degrad / l_mut are the likelihoods under the ancient (degraded)
# and modern (mutation/error) models; l_ratio is the PMD score
# (log-likelihood ratio); newquals are PMD-adjusted base qualities.
(l_degrad, l_mut, l_ratio, newquals) = score_pmd(
rec_read_full, # was: real_read
rec_ref_full, # was: real_ref_seq
quals,
ancient_model_deam,
modern_model_deam,
adjustment_model_deam,
options.polymorphism_contamination,
options.polymorphism_ancient,
options.adjustbaseq_all,
options.adjustbaseq,
options.baseq
)
# TODO re-enable these options step by step...
# options.deamination,
# options.cpg,
# options.nocpg,
# options.udg_half,
# options.pmds_prim)
# if options.pmds_prim:
# maxPMDSval = math.log(l_degrad_max / l_mut_max)
# maxPMDSval = maxPMDSval / readlen
# if l_ratio > 0.0:
# LRnumerator = math.log(l_degrad_max / l_mut_max)
# elif l_ratio < 0.0:
# LRnumerator = math.log(l_degrad_max / l_mut_max)
# if options.PMDSprimthreshold:
# if maxPMDSval < options.PMDSprimthreshold:
# continue
#
# if options.printDS:
# print(l_ratio, maxPMDSval, maxPMDSval,
# maxPMDSval * readlen, readlen)
## l_ratio = l_ratio / LRnumerator
# quals = newquals
if options.adjustbaseq:
if reverse:
qualsp = quals[::-1]
else:
qualsp = quals
line = ('\t'.join(col[0:10]) + '\t'
+ qualsp + '\t'
+ '\t'.join(col[11:]))
# add PMDS tag
if options.writesamfield is True:
# remove DS field if present
if ds_field is True:
newline = ''
for col in line.split('\t'):
if 'DS:Z:' in col:
continue
else:
newline += col + '\t'
line = newline.rstrip('\t')
line = line.rstrip('\n') + '\t' + 'DS:Z:' + str(round(l_ratio, 3))
if options.printDS:
print(l_degrad, '\t', l_mut, '\t', l_degrad / l_mut, '\t', l_ratio)
# print(l_degrad, '\t', l_mut, '\t', l_degrad / l_mut, '\t',
# l_ratio, '\t',
# readlen, '\t',
# perc_identity, '\t',
# perc_identity * math.log(l_degrad / l_mut))
if options.dry:
if len(line) < 1:
continue
print(line.rstrip('\n'))
continue
if options.threshold > (-10000) or options.upperthreshold < (1000000):
if (l_ratio >= options.threshold
and l_ratio < options.upperthreshold):
print(line.rstrip('\n'))
else:
excluded_threshold += 1
if options.printalignments:
if (options.threshold > -10000
or options.upperthreshold < 1000000):
try:
l_ratio = math.log(l_degrad / l_mut)
except:
continue
if (l_ratio < options.threshold
or l_ratio > options.upperthreshold < 1000000):
continue
quals1 = ''
quals2 = ''
for qual in quals:
qnum = ord(qual) - 33
if qnum < 10:
quals1 += '0'
quals2 += str(qnum)
else:
quals1 += str(qnum)[0]
quals2 += str(qnum)[1]
# print(md_field, cigar, reverse)
# print(col[9])
print(real_read)
print(mismatch_string)
print(real_ref_seq)
print(quals)
# print(quals1)
# print(quals2)
# print(col[10])
print('')
passed += 1
if passed >= options.maxreads:
break
if options.first:
first_ct = first_c + first_t
freq = 1.0 * first_t / first_ct
se = math.sqrt((freq * (1.0 - freq)) / first_ct)
if freq == 0.0:
se = 'NA'
print('C>T at first position and SE:', freq, '\t', se)
# print('C>T at first position and SE:', freq, '\t', se, '\t',
# n, first_c, first_t)
# --stats: final exclusion/throughput counters on stderr; the counters are
# accumulated in the read loop above.  NOTE(review): downstream tools
# (e.g. SamSifter) may parse these exact strings, so the wording --
# including the 'due percent' typo -- is deliberately left untouched.
if options.stats:
print('""""""""""""""""""""""""""""""""', file=sys.stderr)
# added stats
print("excluded due to extended CIGAR (H, N, P):\t%i"
% (cigarextexcluded,), file=sys.stderr)
print("excluded due percent identity filter:\t%i"
% (identityexcluded,), file=sys.stderr)
print("excluded due to clipping:\t%i" % (clipexcluded,),
file=sys.stderr)
print("excluded due to indels:\t%i" % (indelexcluded,),
file=sys.stderr)
print("no MD field:\t%i" % (md_missing,), file=sys.stderr)
print("no G or C in ref:\t%i" % (no_gc_excluded,), file=sys.stderr)
print("total seqs:\t%i" % (passed,), file=sys.stderr)
print("excluded due to PMD score < %i:\t%i"
% (int(options.threshold), excluded_threshold), file=sys.stderr)
print("passed seqs:\t%i" % (passed - excluded_threshold,),
file=sys.stderr)
print('""""""""""""""""""""""""""""""""', file=sys.stderr)
if options.deamination:
if True:
pairs = ['CT', 'CA', 'CG', 'CC', 'GA', 'GT', 'GC', 'GG']
itotaldict = {}
ztotaldict = {}
for i in range(0, options.range):
itotal = 0
ztotal = 0
for pair in pairs:
thekey = pair + str(i)
try:
itotal += mismatch_dict[thekey]
except KeyError:
pass
try:
ztotal += mismatch_dict_rev[thekey]
except KeyError:
pass
itotaldict[i] = itotal
ztotaldict[i] = ztotal
print('z\t', '\t'.join(pairs))
for i in range(0, options.range):
print(str(i) + '\t')
for pair in pairs:
thekey = pair + str(i)
if 'C' in pair[0]:
try:
thecount = mismatch_dict[thekey]
except KeyError:
print('0.00000\t')
continue
thetotal = itotaldict[i]
frac = 1.0 * thecount / thetotal
if 'G' in pair[0]:
try:
thecount = mismatch_dict_rev[thekey]
except KeyError:
print('0.00000\t')
continue
thetotal = ztotaldict[i]
frac = 1.0 * thecount / thetotal
print(str(round(frac, 5)) + '\t')
print('')
if options.basecomposition:
print(composition_dict)
print(composition_dict_rev)
if True:
pairs = ['5T', '5A', '5G', '5C', '3T', '3A', '3G', '3C']
itotaldict = {}
ztotaldict = {}
for i in range(-backoffset, options.range):
itotal = 0
ztotal = 0
for pair in pairs:
thekey = pair + str(i)
try:
itotal += composition_dict[thekey]
except KeyError:
pass
try:
ztotal += composition_dict_rev[thekey]
except KeyError:
pass
itotaldict[i] = itotal
ztotaldict[i] = ztotal
print('z\t', '\t'.join(pairs))
for i in range(-backoffset, options.range):
print(str(i) + '\t')
for pair in pairs:
thekey = pair + str(i)
if '5' in pair[0]:
try:
thecount = composition_dict[thekey]
except KeyError:
print('0.00000\t')
continue
thetotal = itotaldict[i]
frac = 1.0 * thecount / thetotal
if '3' in pair[0]:
try:
thecount = composition_dict_rev[thekey]
except KeyError:
print('0.00000\t')
continue
thetotal = ztotaldict[i]
frac = 1.0 * thecount / thetotal
print(str(round(frac, 5)) + '\t')
print('')
exit()
# script entry point: delegate to main(), defined earlier in this file
if __name__ == "__main__":
main()