Source code for pmdtools.pmdtools_mod

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Calculate post-mortem degradation (PMD) scores for NGS reads in SAM files and
filter the reads by these scores.

Modified version of PMDtools based on v0.55 by Pontus Skoglund:

cite: P Skoglund, BH Northoff, MV Shunkov, AP Derevianko, S Paabo, J Krause, M
Jakobsson (2014) Separating endogenous ancient DNA from modern day
contamination in a Siberian Neandertal, PNAS, advance online 27 January

Included changes:

* count and show excluded reads due to unsupported CIGAR operations
* count and show excluded reads due to % identity filtering
* port from Python2 to Python3
* bugfix: ambiguous combinations of CIGAR and MD with deletions lead to shifts
  in reconstructed reference sequence
* support for all CIGAR operations except padded references (untested)
* support of IUPAC ambiguity codes
* optional parameters to adapt % identity calculation to MALT defaults
* refactoring into main method
* use of ArgumentParser over deprecated OptionParser
* improved PEP8 and pylint compliance
* ReST documentation

.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""

import sys
import argparse
import math
import subprocess
from itertools import repeat

# custom libraries
from samsifter.util.sam import reconstruct
from samsifter.util.genetics import reverse_complement, gc, aln_identity


def phred2prob(quality):
    """Translate a PHRED quality score into an error probability.

    Parameters
    ----------
    quality : int
        PHRED-scaled quality score.

    Returns
    -------
    float
        Probability of a sequencing error, i.e. ``10 ** (-quality / 10)``.
    """
    exponent = -quality / 10.0
    return 10.0 ** exponent
def prob2phred(probability):
    """Convert probability of sequencing error to PHRED score.

    Parameters
    ----------
    probability : float
        The probability of a sequencing error, must be greater than 0.

    Returns
    -------
    float
        PHRED score, i.e. ``-10 * log10(probability)``.

    Raises
    ------
    ValueError
        If ``probability`` is zero or negative (raised by ``math.log10``).
    """
    # math.log10 instead of math.log(x, 10): the two-argument form divides two
    # rounded natural logarithms and yields e.g. 29.999999999999996 for
    # p = 0.001, which loses a whole PHRED unit once truncated to an integer.
    return -10.0 * math.log10(probability)
def prob2ascii(probability, offset=33):
    """Encode a sequencing error probability as a single quality character.

    The probability is converted to a PHRED score, truncated to an integer
    and shifted by the ASCII offset (33 by default, Illumina standard).

    Parameters
    ----------
    probability : float
        The probability of a sequencing error.
    offset : int, optional
        ASCII encoding offset, defaults to 33.

    Returns
    -------
    str
        Single character encoding the truncated PHRED score.
    """
    phred = prob2phred(probability)
    codepoint = int(phred) + offset
    return chr(codepoint)
def prob(pos, model, qual, poly, match=True):
    """Return the probability of a match (or mismatch) under a damage model.

    Combines the damage probability at the given distance from the read end,
    one third of the sequencing error probability encoded by the base quality
    and the probability of a true polymorphism.

    Parameters
    ----------
    pos : int
        Position of base from end of read, used as index into ``model``.
    model : list
        Distribution of damage probabilities.
    qual : str
        Single ASCII-encoded base quality character (offset 33).
    poly : float
        Probability of a true polymorphism at this base.
    match : bool, optional
        Return the match probability (default) or, if False, its complement.

    Returns
    -------
    float
        Probability of either match or mismatch under given model.
    """
    damage = float(model[pos])
    error = phred2prob((ord(qual) - 33)) / 3.0

    # the three ways of observing a match, summed in their original order
    undamaged = (1.0 - damage) * (1.0 - error) * (1.0 - poly)
    damage_and_error = damage * error * (1.0 - poly)
    error_and_poly = error * poly * (1.0 - damage)
    p_match = undamaged + damage_and_error + error_and_poly

    return p_match if match else 1.0 - p_match
def likelihood_match(position, model, qual, poly):
    """Return the likelihood of observing a match under the given model.

    Thin wrapper around :func:`prob` with ``match=True``.

    Parameters
    ----------
    position : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        Single ASCII-encoded base quality character (offset 33).
    poly : float
        Probability of a true polymorphism at this base.

    Returns
    -------
    float
        Probability of match under given model.
    """
    return prob(position, model, qual, poly, match=True)
def likelihood_mismatch(position, model, qual, poly):
    """Return the likelihood of observing a mismatch under the given model.

    Thin wrapper around :func:`prob` with ``match=False``.

    Parameters
    ----------
    position : int
        Position of base from end of read.
    model : list
        Distribution of damage probabilities.
    qual : str
        Single ASCII-encoded base quality character (offset 33).
    poly : float
        Probability of a true polymorphism at this base.

    Returns
    -------
    float
        Probability of mismatch under given model.
    """
    return prob(position, model, qual, poly, match=False)
def adjust_quality(position, model, qual):
    """Combine damage and sequencing error into one error probability.

    Parameters
    ----------
    position : int
        Position of base from end of read, used as index into ``model``.
    model : list
        Distribution of damage probabilities.
    qual : str
        Single ASCII-encoded base quality character (offset 33).

    Returns
    -------
    float
        Adjusted error probability: one minus the probability that the base
        is neither damaged nor affected by a sequencing error.
    """
    damage = float(model[position])
    error = phred2prob((ord(qual) - 33)) / 3.0
    unaffected = (1.0 - damage) * (1.0 - error)
    return 1.0 - unaffected
def geometric(pval, kval, constant):
    """Evaluate a geometric probability mass shifted by a constant.

    Parameters
    ----------
    pval : float
        Parameter p of geometric distribution.
    kval : float
        Parameter k of geometric distribution.
    constant : float
        Constant added to the geometric probability.

    Returns
    -------
    float
        ``(1 - pval) ** (kval - 1) * pval + constant``.
    """
    failures = (1.0 - pval) ** (kval - 1)
    return failures * pval + constant
def fa_get(ffile, chrom, fstart, fend, samtoolspath, verbose=False):
    """Fetch a region of a reference sequence from a FASTA file.

    Runs ``<samtoolspath> faidx <ffile> <chrom>:<fstart>-<fend>`` in a
    subprocess, discards the first whitespace-separated token of the output
    (presumably the FASTA header line) and concatenates the remaining tokens.
    Error output of the subprocess is discarded.

    Parameters
    ----------
    ffile : str
        Path to FASTA file.
    chrom : int or str
        Chromosome name or number; converted to str for the region string.
    fstart : int
        Start position.
    fend : int
        End position.
    samtoolspath : str
        Path to samtools.
    verbose : bool, optional
        Enable output of executed commandline, defaults to False.

    Returns
    -------
    str
        Reference sequence, empty if the subprocess returned none.
    """
    location = '%s:%s-%s' % (chrom, fstart, fend)
    cmd_line = [samtoolspath, 'faidx', ffile, location]

    # Text mode is required: in Python 3 the pipe yields bytes otherwise and
    # joining them with a str separator raises TypeError. subprocess.run also
    # waits for the child, and DEVNULL avoids leaking an open /dev/null handle.
    result = subprocess.run(cmd_line,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.DEVNULL,
                            universal_newlines=True)

    tokens = result.stdout.split()
    pileupline = ''.join(tokens[1:])

    if len(pileupline) < 1:
        print('no such reference sequence', cmd_line)
    if verbose:
        print(' '.join(cmd_line))
    return pileupline
def _set_quality(quals, index, char):
    """Return a copy of quality string ``quals`` with ``char`` at ``index``."""
    return quals[:index] + char + quals[index + 1:]


def score_pmd(read, ref, quals, ancient_model_deam, modern_model_deam,
              adjustment_model_deam=None, polymorphism_contamination=0.001,
              polymorphism_ancient=0.001, adjustbaseq_all=False,
              adjustbaseq=False, baseq=0, deamination=False, cpg=False,
              nocpg=False, udg_half=False, pmds_prim=False):
    """Calculate post-mortem degradation score (PMDS).

    Requires full read and reference sequence including clips, skips and gaps
    as recovered from CIGAR and MD to use the correct distance from 5' or 3'
    end in the PMDS calculation. However only the aligned read bases with
    PHRED scores are considered for PMD scoring.

    Positions are classified as follows (``X`` marks clipped/skipped bases,
    ``-`` marks gaps):

    * read ``-`` (skipped intron or gap in read): no read base, ignored
    * read ``X`` and ref ``X`` (hard clip): no quality value, ignored
    * every other position consumes one quality value; of these, soft clips
      (ref ``X``), gaps in the reference (ref ``-``) and ambiguous bases are
      not scored

    Only reference C (read C or T, distance from 5' end) and reference G
    (read G or A, distance from 3' end) contribute to the likelihoods.

    Optional adjustment of base qualities requires additional adjustment
    model.

    Parameters
    ----------
    read : str
        Full read sequence.
    ref : str
        Aligned reference sequence including clips, skips and gaps.
    quals : str
        ASCII-encoded base qualities (offset 33), one character per read base
        that carries a quality value.
    ancient_model_deam : list
        Probability distribution for ancient deamination.
    modern_model_deam : list
        Probability distribution for modern deamination.
    adjustment_model_deam : list, optional
        Adjustments to the deamination probabilities, defaults to None.
    polymorphism_contamination : float, optional
        Probability of polymorphism due to contamination, defaults to 0.001.
    polymorphism_ancient : float, optional
        Probability of polymorphism due to age, defaults to 0.001.
    adjustbaseq_all : bool, optional
        Switch to adjust all base quality values, defaults to False.
    adjustbaseq : bool, optional
        Switch to adjust affected base quality values, defaults to False.
    baseq : int, optional
        Minimum quality value for processed bases, defaults to 0
    deamination : bool, optional
        Currently unused, defaults to False.
    cpg : bool, optional
        Only use Gs in CpG context, defaults to False. Currently only applied
        at reference G positions.
    nocpg : bool, optional
        Do NOT use Gs in CpG context, defaults to False. Currently only
        applied at reference G positions.
    udg_half : bool, optional
        Only use Gs in CpG context, but always use the last base. Defaults to
        False. Currently only applied at reference G positions.
    pmds_prim : bool, optional
        Currently unused, defaults to False.

    Returns
    -------
    float
        Likelihood of the observed bases under the ancient DNA model.
    float
        Likelihood of the observed bases under the modern DNA model.
    float
        Post-mortem degradation score (natural log of the likelihood ratio).
    str
        Base qualities, adjusted if requested, otherwise identical to
        ``quals``.

    Raises
    ------
    ValueError
        If ``read`` and ``ref`` differ in length.
    """
    # this should never happen if reference was correctly reconstructed
    if len(read) != len(ref):
        raise ValueError("Undefined for sequences of unequal length")

    l_degrad = 1.0  # likelihood of degradation
    l_mut = 1.0     # likelihood of mutation
    newquals = quals

    # ambiguous bases
    # TODO some of these may actually be informative!
    ambiguous = ('N', 'X', 'B', 'D', 'H', 'V', 'Y', 'R', 'W', 'S')

    last_qual_idx = len(quals) - 1
    qual_idx = 0
    for aln_idx, (readbase, refbase) in enumerate(zip(read, ref)):
        # skipped intron or gap in read: no read base, no quality consumed
        if readbase == '-':
            continue
        # hard clipping: base is not part of the stored read
        if readbase == 'X' and refbase == 'X':
            continue

        # Every remaining position owns exactly one quality value. The index
        # is advanced here, before any filter can 'continue', so that scored
        # bases no longer all share index 0.
        i = qual_idx            # distance from 5' end of read
        z = last_qual_idx - i   # distance from 3' end of read
        qual_idx += 1

        # soft clipping (ref 'X'), gaps in reference and ambiguous bases
        if (refbase == '-' or readbase in ambiguous
                or refbase in ambiguous):
            continue

        quality = quals[i]
        phred = ord(quality) - 33

        if adjustbaseq_all and adjustment_model_deam is not None:
            newprob = (adjustment_model_deam[i]
                       + adjustment_model_deam[z]
                       + phred2prob(phred))
            newquals = _set_quality(newquals, i, prob2ascii(newprob))

        if phred < baseq:
            # make sure that quality is adjusted even if baseq is below
            # threshold
            if adjustbaseq:
                if refbase == 'C' and readbase == 'T':
                    newprob = adjust_quality(i, ancient_model_deam, quality)
                    newquals = _set_quality(newquals, i,
                                            prob2ascii(newprob))
                elif refbase == 'G' and readbase == 'A':
                    newprob = adjust_quality(z, ancient_model_deam, quality)
                    newquals = _set_quality(newquals, i,
                                            prob2ascii(newprob))
            continue

        # compute degradation score
        if refbase == 'C':
            # NOTE: CpG context filters are not applied to reference C here
            if readbase == 'T':
                l_degrad = l_degrad * likelihood_mismatch(
                    i, ancient_model_deam, quality, polymorphism_ancient)
                l_mut = l_mut * likelihood_mismatch(
                    i, modern_model_deam, quality,
                    polymorphism_contamination)
                if adjustbaseq:
                    newprob = adjust_quality(i, ancient_model_deam, quality)
                    newquals = _set_quality(newquals, i,
                                            prob2ascii(newprob))
            elif readbase == 'C':
                l_degrad = l_degrad * likelihood_match(
                    i, ancient_model_deam, quality, polymorphism_ancient)
                l_mut = l_mut * likelihood_match(
                    i, modern_model_deam, quality,
                    polymorphism_contamination)

        elif refbase == 'G':
            # Dinucleotide context is looked up by alignment position; the
            # first aligned position has no preceding base and therefore is
            # never in CpG context (no wrap-around to the last base).
            preceded_by_c = aln_idx > 0 and ref[aln_idx - 1] == 'C'
            if cpg:
                if not preceded_by_c:
                    continue
            elif nocpg:
                if preceded_by_c:
                    continue
            elif udg_half:
                if not preceded_by_c and z != 0:
                    continue

            if readbase == 'A':
                l_degrad = l_degrad * likelihood_mismatch(
                    z, ancient_model_deam, quality, polymorphism_ancient)
                l_mut = l_mut * likelihood_mismatch(
                    z, modern_model_deam, quality,
                    polymorphism_contamination)
                if adjustbaseq:
                    newprob = adjust_quality(z, ancient_model_deam, quality)
                    newquals = _set_quality(newquals, i,
                                            prob2ascii(newprob))
            elif readbase == 'G':
                l_degrad = l_degrad * likelihood_match(
                    z, ancient_model_deam, quality, polymorphism_ancient)
                l_mut = l_mut * likelihood_match(
                    z, modern_model_deam, quality,
                    polymorphism_contamination)

    # calculate log-likelihood ratio as final PMD score
    pmds = math.log(l_degrad / l_mut)
    return (l_degrad, l_mut, pmds, newquals)
[docs]def main(): """Executable to calculate post-mortem degradation scores for SAM files. See ``--help`` for details on expected arguments. Takes input only on STDIN. Logs messages to STDERR and writes processed SAM file to STDOUT. """ parser = argparse.ArgumentParser( usage=("python %(prog)s <SAM formatted data with MD field present " "from stdin> [options]") ) # options to control PMD score calculation pmd_options = parser.add_argument_group( title="PMD parameters", description=("influence the geometric distribution used for PMD score " "calculation by changing these parameters") ) pmd_options.add_argument("--PMDpparam", action="store", type=float, dest="PMDpparam", help="parameter p in geometric probability \ distribution of PMD", default=0.3) pmd_options.add_argument("--PMDconstant", action="store", type=float, dest="PMDconstant", help="constant C in geometric probability \ distribution of PMD", default=0.01) pmd_options.add_argument("--polymorphism_ancient", action="store", type=float, dest="polymorphism_ancient", help="True biological polymorphism between the \ ancient individual and the reference sequence", default=0.001) pmd_options.add_argument("--polymorphism_contamination", action="store", type=float, dest="polymorphism_contamination", help="True biological polymorphism between the \ contaminants and the reference sequence", default=0.001) # options to filter SAM file filter_options = parser.add_argument_group( title="filters", description=("restrict analysis to a subset of reads by setting these " "thresholds") ) filter_options.add_argument("--dry", action="store_true", dest="dry", help="print SAM input without any filters", default=False) filter_options.add_argument("-n", "--number", action="store", type=int, dest="maxreads", help="stop after these many reads have been \ processed", default=(10 ** 20)) filter_options.add_argument("-c", "--chromosome", action="store", type=str, dest="chromosome", help="only process data from this chromosome", 
default=False) filter_options.add_argument("--perc_identity", type=float, dest="perc_identity", help="only output sequences with percent \ identity above this threshold", default=0.0) filter_options.add_argument("-m", "--requiremapq", action="store", type=int, dest="mapq", help="only process sequences with mapping \ quality at least this great", default=0) filter_options.add_argument("--readlength", action="store", type=int, dest="readlength", help="only process sequences with this read \ length", default=0) filter_options.add_argument("--maxlength", action="store", type=int, dest="maxlength", help="only process sequences with max this \ read length", default=300) filter_options.add_argument("--minlength", action="store", type=int, dest="minlength", help="only process sequences with min this \ read length", default=0) filter_options.add_argument("--maxGC", action="store", type=float, dest="maxGC", help="only process sequences with max this \ GC content of the aligning reference sequence", default=1.0) filter_options.add_argument("--minGC", action="store", type=float, dest="minGC", help="only process sequences with min this GC \ content of the aligning reference sequence", default=0.0) filter_options.add_argument("-q", "--requirebaseq", action="store", type=int, dest="baseq", help="only process bases with base quality at \ least this great", default=0) filter_options.add_argument("-t", "--threshold", type=float, dest="threshold", help="only output sequences with PMD score \ above this threshold", default=(-20000.0)) filter_options.add_argument("--upperthreshold", type=float, dest="upperthreshold", help="only output sequences with PMD score \ below this threshold", default=(1000000.0)) # options controlling handling of CpG contexts cpg_options = parser.add_argument_group( title="CpG context", description=("options controlling the handling of deamination in CpG " "contexts") ) # --CpG, --noCpG and --UDGhalf exclude each other cpg_excl = 
cpg_options.add_mutually_exclusive_group(required=False) cpg_excl.add_argument("--CpG", "--UDGplus", action="store_true", dest="cpg", help="only use Cs and Gs in CpG context", default=False) cpg_excl.add_argument("--noCpG", action="store_true", dest="nocpg", help="dont use Cs and Gs in CpG context", default=False) cpg_excl.add_argument("--UDGhalf", action="store_true", dest="udg_half", help=("only use Cs and Gs in CpG context, the first " "and last base are used regardless of " "dinucleotide context"), default=False) # cpg_options.add_argument("--UDGplus", # action="store_true", # dest="UDGplus", # help="only use Cs and Gs in CpG context \ # (synonymous with option --CpG)", # default=False) cpg_options.add_argument("--UDGminus", action="store_true", dest="UDGminus", help="use all bases (placeholder)", default=False) # options controlling output output_options = parser.add_argument_group( title="output", description="control type and detail of output" ) output_options.add_argument("--verbose", action="store_true", dest="verbose", help="verbose", default=False) output_options.add_argument("--header", action="store_true", dest="header", help="output the SAM header", default=False) output_options.add_argument("--writesamfield", action="store_true", dest="writesamfield", help="add 'DS:Z:<PMDS>' field to SAM output, \ will overwrite if already present", default=False) output_options.add_argument("--stats", action="store_true", dest="stats", help="output summarizing statistics to stderr", default=False) output_options.add_argument("--debug", action="store_true", dest="debug", help="show additional information for \ debugging", default=False) output_options.add_argument("-p", "--printDS", action="store_true", dest="printDS", help="print PMD scores", default=False) output_options.add_argument("--printalignments", action="store_true", dest="printalignments", help="print human readable alignments", default=False) output_options.add_argument("--platypus", action="store_true", 
dest="platypus", help="output big list of base frequencies for \ platypus", default=False) output_options.add_argument("-d", "--deamination", action="store_true", dest="deamination", help="output base frequencies in the read at \ positions where there are C or G in the \ reference", default=False) # options to modify calculation of percent identity perc_id_options = parser.add_argument_group( title="percent identity", description="options to modify calculation of percent identity" ) perc_id_options.add_argument("--include_deamination", action="store_true", dest="include_deamination", help="treat possibly deaminated T>C and A>G \ pairs as mismatches in calculation of \ percent identity", default=False) perc_id_options.add_argument("--include_indels", action="store_true", dest="include_indels", help="treat insertions and deletions as \ mismatches in calculation of percent \ identity", default=False) perc_id_options.add_argument("--include_unknown", action="store_true", dest="include_unknown", help="treat Ns in either read or reference \ as mismatch in calculation of percent \ identity", default=False) # remaining options parser.add_argument('--version', action='version', version='%(prog)s v0.55 (modified)') parser.add_argument("--PMDSprim", action="store_true", dest="pmds_prim", help="PMDSprim", default=False) parser.add_argument("--PMDSprimthreshold", action="store", type=float, dest="PMDSprimthreshold", help="PMDSprimthreshold", default=False) parser.add_argument("--first", action="store_true", dest="first", help="outputs the deamination rate at the first \ position only, but with a standard error", default=False) parser.add_argument("--range", action="store", type=int, dest="range", help="output deamination patterns for this many \ positions from the sequence terminus (default=30)", default=30) parser.add_argument("--noclips", action="store_true", dest="noclips", help="no clips", default=False) parser.add_argument("--noindels", action="store_true", 
dest="noindels", help="no indels", default=False) parser.add_argument("--onlyclips", action="store_true", dest="onlyclips", help="only clips", default=False) parser.add_argument("--onlydeletions", action="store_true", dest="onlydeletions", help="only deletions", default=False) parser.add_argument("--onlyinsertions", action="store_true", dest="onlyinsertions", help="only insertions", default=False) parser.add_argument("--nodeletions", action="store_true", dest="nodeletions", help="no deletions", default=False) parser.add_argument("--noinsertions", action="store_true", dest="noinsertions", help="no insertions", default=False) parser.add_argument("--notreverse", action="store_true", dest="notreverse", help="no reverse complement alignments", default=False) parser.add_argument("-a", "--adjustbaseq", action="store_true", dest="adjustbaseq", help="apply PMD-aware adjustment of base quality \ scores specific to C>T and G>A mismatches to the \ reference", default=False) parser.add_argument("--adjustbaseq_all", action="store_true", dest="adjustbaseq_all", help="apply PMD-aware adjustment of base quality \ scores regardless of observed bases", default=False) parser.add_argument("--samtoolspath", action="store", dest="samtoolspath", help="full path to samtools", default='samtools') parser.add_argument("--basecomposition", action="store_true", dest="basecomposition", help="basecomposition", default=False) parser.add_argument("-r", "--refseq", action="store", dest="refseq", help="refseq", default=False) parser.add_argument("--estimate", action="store_true", dest="estimate", help="two-terminus estimate of contamination", default=False) parser.add_argument("--estimatebase", action="store", type=int, dest="estimatebase", help="position of base used fortwo-terminus estimate \ of contamination", default=0) parser.add_argument("-b", "--basic", action="store", type=int, dest="basic", help="only output reads with a C>T mismatch this \ many basepairs from the 5' end", default=0) 
(options, args) = parser.parse_known_args() # if options.UDGplus: # options.CpG = True # maxlen = options.maxlength # pre-compute probabilities of deamination for 1000 bases under modern and # ancient models modern_model_deam = [i for i in repeat(0.001, 1000)] ancient_model_deam = [geometric(options.PMDpparam, i, options.PMDconstant) for i in range(1, 1000)] adjustment_model_deam = None if options.adjustbaseq_all: # constant is 0.0 here in contrast to model used to compute PMD scores adjustment_model_deam = [geometric(options.PMDpparam, i, 0.0) for i in range(1, 1000)] # base composition # start_count = 0 # rev_start_count = 0 # not_counted = 0 # imperfect = 0 # start_dict= {} # start_dict_rev = {} mismatch_dict = {} mismatch_dict_rev = {} # mismatch_dict_CpG = {} # mismatch_dict_CpG_rev = {} first_c = 0 first_t = 0 clipexcluded = 0 indelexcluded = 0 cigarextexcluded = 0 # reads excluded for extended CIGAR (N, H, P, not S) identityexcluded = 0 # reads excluded due to percent identity filter md_missing = 0 no_gc_excluded = 0 excluded_threshold = 0 passed = 0 noquals = 0 # maskings = 0 # # CCandCC = 0 # CTandCC = 0 # CCandCT = 0 # CTandCT = 0 # # estimator_list = [] composition_dict = {} composition_dict_rev = {} line_counter = 0 for line in sys.stdin: if '@' in line[0]: if options.header: print(line.rstrip('\n')) continue line_counter += 1 line = line.rstrip('\n') col = line.split('\t') if options.debug: print(col) # readname = col[0] position = int(col[3]) chromosome = col[2] if options.chromosome: if chromosome != options.chromosome: continue mapq = int(col[4]) read = col[9] readlen = len(read) quals = col[10] flag = col[1] position = int(col[3]) cigar = col[5] if len(quals) < 2: noquals += 1 continue if options.noinsertions: if 'I' in cigar: continue if options.nodeletions: if 'D' in cigar: continue if options.onlyinsertions: if 'I' not in cigar: continue if options.onlydeletions: if 'D' not in cigar: continue if options.noindels: if 'I' in cigar or 'D' in cigar: 
indelexcluded += 1 continue if options.noclips: if 'S' in cigar or 'H' in cigar or 'N' in cigar or 'P' in cigar: clipexcluded += 1 continue if options.onlyclips: if 'S' not in cigar: continue if 'H' in cigar or 'P' in cigar or 'N' in cigar: print("cigar found: %s" % (cigar,), "PMDtools only supports cigar operations M, I, S and D,", "the alignment has been excluded", file=sys.stderr) cigarextexcluded += 1 continue if mapq < options.mapq: continue # read length filter # TODO check for default values unnecessary, just remove defaults and # check for args directly if options.readlength > 0: if options.readlength != len(read): continue if options.minlength > 0: if options.minlength > len(read): continue if options.maxlength != 300: if options.maxlength < len(read): continue # chromosome filter if options.chromosome: if chromosome != options.chromosome: continue # check orientation of read if flag.isdigit(): if int(flag) & 16: reverse = True else: reverse = False else: if 'r' in flag: reverse = True else: reverse = False # filter reverse reads if options.notreverse: if reverse: continue # check for previously calculated PMD score ds_field = False if 'DS:Z:' in line: ds_field = True pmds = float(line.split('DS:Z:')[1].rstrip('\n').split()[0]) l_ratio = pmds # Recreate reference sequence from MD field if (ds_field is False or options.writesamfield or options.basic > 0 or options.perc_identity > 0.01 or options.printalignments or options.adjustbaseq or options.adjustbaseq_all or options.deamination or options.dry or options.estimate or options.first): read = col[9] try: md_field = line.split('MD:Z:')[1].split()[0].rstrip('\n') except IndexError: md_missing += 1 continue # using external library to reconstruct reference from CIGAR and MD rec = reconstruct(read, cigar, md_field) (rec_ref_full, rec_read_full, rec_ref_aln, rec_read_aln) = rec if reverse: read = reverse_complement(read) # don't use raw read beyond here... 
rec_ref_full = reverse_complement(rec_ref_full) rec_read_full = reverse_complement(rec_read_full) rec_ref_aln = reverse_complement(rec_ref_aln) rec_read_aln = reverse_complement(rec_read_aln) quals = quals[::-1] # this assumption previously lead to trouble because of missing # gaps in aligned read! real_read = read real_ref_seq = rec_ref_aln # debugging the reference sequence reconstruction if options.debug: print("%s\tread\n%s\tref" % (real_read, real_ref_seq)) print("CIGAR:\t%s" % (cigar,)) print("MD:\t%s" % (md_field,)) # GC content filter if options.maxGC < 1.0 or options.minGC > 0.0: gc_content = gc(real_ref_seq) if gc_content > options.maxGC or gc_content < options.minGC: continue # test for empty reference sequence TODO include in reconstruction # method and throw exception? if ('G' not in real_ref_seq and 'C' not in real_ref_seq and 'T' not in real_ref_seq and 'A' not in real_ref_seq): print('bad reference sequence reconstruction: %s' % real_ref_seq, file=sys.stderr) print('SAM line: %s' % (line,)) exit(1) if options.basecomposition: backoffset = 10 if reverse: endpos = position startpos = position + len(real_read) else: startpos = position endpos = position + len(real_read) # 5' end largerefseq = fa_get(options.refseq, chromosome, startpos - backoffset, startpos + options.range, options.samtoolspath) if len(largerefseq) < 1: continue # print largerefseq if reverse: largerefseq = reverse_complement(largerefseq) # print(largerefseq, real_read) for i in range(-backoffset, options.range): # print(i+backoffset, len(largerefseq)) base = largerefseq[min([i + backoffset, len(largerefseq)])] thekey = '5' + base + str(i) if thekey in composition_dict.keys(): addition = composition_dict[thekey] addition += 1 composition_dict[thekey] = addition else: composition_dict[thekey] = 1 # 3' end largerefseq = fa_get(options.refseq, chromosome, endpos - options.range, endpos + backoffset, options.samtoolspath) if len(largerefseq) < 1: continue if reverse: largerefseq = 
reverse_complement(largerefseq) for i in range(-backoffset, options.range): base = largerefseq[min([i + backoffset, len(largerefseq)])] thekey = '3' + base + str(i) if thekey in composition_dict_rev.keys(): addition = composition_dict_rev[thekey] addition += 1 composition_dict_rev[thekey] = addition else: composition_dict_rev[thekey] = 1 continue # useless? # basic filter # prints the SAM line if a C>T mismatch with sufficient base quality is # observed in the first n bases, where n is specified if options.basic > 0: for read, ref, pos in zip(real_read, real_ref_seq, range(0, len(real_ref_seq))): if read == 'N': break elif ref == 'N': break elif read == '-': break elif ref == '-': break i = pos # - start_position if i >= readlen: # 20 break if i >= options.basic: break if options.cpg: if (ref == 'C' and read == 'T' and ord(quals[i] - 33) > options.baseq and real_ref_seq[i + 1] == 'G'): print(line.rstrip('\n')) break elif (ref == 'C' and read == 'T' and ord(quals[i]) - 33 > options.baseq): print(line.rstrip('\n')) break # first base # prints the deamination rate at the first base and a standard error # computed by jackknife over reads if options.first: ref = real_ref_seq[0] read = real_read[0] if options.cpg: if (ref == 'C' and read == 'T' and (ord(quals[0])-33) > options.baseq and real_ref_seq[1] == 'G'): first_t += 1 if (ref == 'C' and read == 'C' and (ord(quals[0])-33) > options.baseq and real_ref_seq[1] == 'G'): first_c += 1 else: if (ref == 'C' and read == 'T' and ord(quals[0]) - 33 > options.baseq): first_t += 1 if (ref == 'C' and read == 'C' and ord(quals[0]) - 33 > options.baseq): first_c += 1 if options.perc_identity > 0.01 or options.printalignments: # divergence filter (perc_identity, mismatch_string) = aln_identity( rec_read_aln, rec_ref_aln, options.include_indels, options.include_deamination, options.include_unknown ) if perc_identity < options.perc_identity: identityexcluded += 1 continue # start PMD score computations if (ds_field is False or 
(ds_field is True and options.writesamfield is True) or options.basic > 0 or options.adjustbaseq or options.adjustbaseq_all or options.deamination or options.dry): (l_degrad, l_mut, l_ratio, newquals) = score_pmd( rec_read_full, # was: real_read rec_ref_full, # was: real_ref_seq quals, ancient_model_deam, modern_model_deam, adjustment_model_deam, options.polymorphism_contamination, options.polymorphism_ancient, options.adjustbaseq_all, options.adjustbaseq, options.baseq ) # TODO re-enable these options step by step... # options.deamination, # options.cpg, # options.nocpg, # options.udg_half, # options.pmds_prim) # if options.pmds_prim: # maxPMDSval = math.log(l_degrad_max / l_mut_max) # maxPMDSval = maxPMDSval / readlen # if l_ratio > 0.0: # LRnumerator = math.log(l_degrad_max / l_mut_max) # elif l_ratio < 0.0: # LRnumerator = math.log(l_degrad_max / l_mut_max) # if options.PMDSprimthreshold: # if maxPMDSval < options.PMDSprimthreshold: # continue # # if options.printDS: # print(l_ratio, maxPMDSval, maxPMDSval, # maxPMDSval * readlen, readlen) ## l_ratio = l_ratio / LRnumerator # quals = newquals if options.adjustbaseq: if reverse: qualsp = quals[::-1] else: qualsp = quals line = ('\t'.join(col[0:10]) + '\t' + qualsp + '\t' + '\t'.join(col[11:])) # add PMDS tag if options.writesamfield is True: # remove DS field if present if ds_field is True: newline = '' for col in line.split('\t'): if 'DS:Z:' in col: continue else: newline += col + '\t' line = newline.rstrip('\t') line = line.rstrip('\n') + '\t' + 'DS:Z:' + str(round(l_ratio, 3)) if options.printDS: print(l_degrad, '\t', l_mut, '\t', l_degrad / l_mut, '\t', l_ratio) # print(l_degrad, '\t', l_mut, '\t', l_degrad / l_mut, '\t', # l_ratio, '\t', # readlen, '\t', # perc_identity, '\t', # perc_identity * math.log(l_degrad / l_mut)) if options.dry: if len(line) < 1: continue print(line.rstrip('\n')) continue if options.threshold > (-10000) or options.upperthreshold < (1000000): if (l_ratio >= options.threshold and 
l_ratio < options.upperthreshold): print(line.rstrip('\n')) else: excluded_threshold += 1 if options.printalignments: if (options.threshold > -10000 or options.upperthreshold < 1000000): try: l_ratio = math.log(l_degrad / l_mut) except: continue if (l_ratio < options.threshold or l_ratio > options.upperthreshold < 1000000): continue quals1 = '' quals2 = '' for qual in quals: qnum = ord(qual) - 33 if qnum < 10: quals1 += '0' quals2 += str(qnum) else: quals1 += str(qnum)[0] quals2 += str(qnum)[1] # print(md_field, cigar, reverse) # print(col[9]) print(real_read) print(mismatch_string) print(real_ref_seq) print(quals) # print(quals1) # print(quals2) # print(col[10]) print('') passed += 1 if passed >= options.maxreads: break if options.first: first_ct = first_c + first_t freq = 1.0 * first_t / first_ct se = math.sqrt((freq * (1.0 - freq)) / first_ct) if freq == 0.0: se = 'NA' print('C>T at first position and SE:', freq, '\t', se) # print('C>T at first position and SE:', freq, '\t', se, '\t', # n, first_c, first_t) if options.stats: print('""""""""""""""""""""""""""""""""', file=sys.stderr) # added stats print("excluded due to extended CIGAR (H, N, P):\t%i" % (cigarextexcluded,), file=sys.stderr) print("excluded due percent identity filter:\t%i" % (identityexcluded,), file=sys.stderr) print("excluded due to clipping:\t%i" % (clipexcluded,), file=sys.stderr) print("excluded due to indels:\t%i" % (indelexcluded,), file=sys.stderr) print("no MD field:\t%i" % (md_missing,), file=sys.stderr) print("no G or C in ref:\t%i" % (no_gc_excluded,), file=sys.stderr) print("total seqs:\t%i" % (passed,), file=sys.stderr) print("excluded due to PMD score < %i:\t%i" % (int(options.threshold), excluded_threshold), file=sys.stderr) print("passed seqs:\t%i" % (passed - excluded_threshold,), file=sys.stderr) print('""""""""""""""""""""""""""""""""', file=sys.stderr) if options.deamination: if True: pairs = ['CT', 'CA', 'CG', 'CC', 'GA', 'GT', 'GC', 'GG'] itotaldict = {} ztotaldict = {} for 
i in range(0, options.range): itotal = 0 ztotal = 0 for pair in pairs: thekey = pair + str(i) try: itotal += mismatch_dict[thekey] except KeyError: pass try: ztotal += mismatch_dict_rev[thekey] except KeyError: pass itotaldict[i] = itotal ztotaldict[i] = ztotal print('z\t', '\t'.join(pairs)) for i in range(0, options.range): print(str(i) + '\t') for pair in pairs: thekey = pair + str(i) if 'C' in pair[0]: try: thecount = mismatch_dict[thekey] except KeyError: print('0.00000\t') continue thetotal = itotaldict[i] frac = 1.0 * thecount / thetotal if 'G' in pair[0]: try: thecount = mismatch_dict_rev[thekey] except KeyError: print('0.00000\t') continue thetotal = ztotaldict[i] frac = 1.0 * thecount / thetotal print(str(round(frac, 5)) + '\t') print('') if options.basecomposition: print(composition_dict) print(composition_dict_rev) if True: pairs = ['5T', '5A', '5G', '5C', '3T', '3A', '3G', '3C'] itotaldict = {} ztotaldict = {} for i in range(-backoffset, options.range): itotal = 0 ztotal = 0 for pair in pairs: thekey = pair + str(i) try: itotal += composition_dict[thekey] except KeyError: pass try: ztotal += composition_dict_rev[thekey] except KeyError: pass itotaldict[i] = itotal ztotaldict[i] = ztotal print('z\t', '\t'.join(pairs)) for i in range(-backoffset, options.range): print(str(i) + '\t') for pair in pairs: thekey = pair + str(i) if '5' in pair[0]: try: thecount = composition_dict[thekey] except KeyError: print('0.00000\t') continue thetotal = itotaldict[i] frac = 1.0 * thecount / thetotal if '3' in pair[0]: try: thecount = composition_dict_rev[thekey] except KeyError: print('0.00000\t') continue thetotal = ztotaldict[i] frac = 1.0 * thecount / thetotal print(str(round(frac, 5)) + '\t') print('') exit()
# Only start the command-line tool when this file is executed as a script,
# so that importing the module does not trigger any processing.
if __name__ == "__main__":
    main()