Source code for samsifter.util.genetics

# -*- coding: utf-8 -*-
"""Library of standard methods for manipulation of genetic sequences.

.. moduleauthor:: Florian Aldehoff (f.aldehoff@student.uni-tuebingen.de)
"""


[docs]def is_iupac(seq): """Check nucleotide sequence for invalid (non-IUPAC) codes. Parameters ---------- seq : str Nucleotide sequence. Returns ------- bool True if sequence contains only IUPAC ambiguity codes, otherwise False. """ iupac = set(('A', 'T', 'C', 'G', 'Y', 'R', 'W', 'S', 'K', 'M', 'B', 'H', 'D', 'V', 'X', 'N', '-', 'a', 't', 'c', 'g', 'y', 'r', 'w', 's', 'k', 'm', 'b', 'h', 'd', 'v', 'x', 'n')) return set(seq).issubset(iupac)
[docs]def opposite(base): """Mirror base code into the exact opposite IUPAC ambiguity code. 'A' becomes 'not A', gap becomes any base, etc. Parameters ---------- base : str Single one-letter IUPAC nucleotide code. Returns ------- str Opposite one-letter IUPAC ambiguity code. None If base is invalid. """ opposites = {'A': 'B', 'G': 'H', 'C': 'D', 'T': 'V', 'Y': 'R', # pyrimidine | purin 'R': 'Y', # purine | pyrimidine 'W': 'S', # weak | strong 'S': 'W', # strong | weak 'K': 'M', # keto | amino 'M': 'K', # amino | keto 'B': 'A', 'H': 'G', 'D': 'C', 'V': 'T', 'X': '-', 'N': '-', '-': 'N', # ____________________ 'a': 'b', 'g': 'h', 'c': 'd', 't': 'v', 'y': 't', # pyrimidine | purin 'r': 'y', # purine | pyrimidine 'w': 's', # weak | strong 's': 'w', # strong | weak 'k': 'm', # keto | amino 'm': 'k', # amino | keto 'b': 'a', 'h': 'g', 'd': 'c', 'v': 't', 'x': '-', 'n': '-'} try: opp = opposites[base] except KeyError: return None return opp
[docs]def complement(base): """Returns a base's complement according to IUPAC or None. Parameters ---------- base : str Single one-letter IUPAC nucleotide code. Returns ------- str Complementary one-letter IUPAC ambiguity code. None If base is invalid. """ # complements = {'A': 'T', # 'G': 'C', # 'C': 'A', # 'T': 'G', # 'Y': 'R', # pyrimidine > purin # 'R': 'Y', # purine > pyrimidine # 'W': 'W', # weak: A or T # 'S': 'S', # strong: G or C # 'K': 'M', # keto > amino # 'M': 'K', # amino > keto # 'D': 'H', # 'V': 'B', # 'H': 'D', # 'B': 'V', # 'X': 'X', # 'N': 'N', # '-': '-', # ____________________ # 'a': 't', # 'g': 'c', # 'c': 'a', # 't': 'g', # 'y': 'r', # 'r': 'y', # 'w': 'w', # 's': 's', # 'k': 'm', # 'm': 'k', # 'd': 'h', # 'v': 'b', # 'h': 'd', # 'b': 'v', # 'x': 'x', # 'n': 'n'} # try: # complement = complements[base] # except KeyError: # return None # return complement # not pretty but a lot faster! if base == 'A': return 'T' elif base == 'G': return 'C' elif base == 'C': return 'A' elif base == 'T': return 'G' elif base == 'a': return 't' elif base == 'g': return 'c' elif base == 'c': return 'a' elif base == 't': return 'g' elif base == 'Y': return 'R' elif base == 'R': return 'Y' elif base == 'W': return 'W' elif base == 'S': return 'S' elif base == 'K': return 'M' elif base == 'M': return 'K' elif base == 'D': return 'H' elif base == 'V': return 'B' elif base == 'H': return 'D' elif base == 'B': return 'V' elif base == 'X': return 'X' elif base == 'N': return 'N' elif base == '-': return '-' elif base == 'y': return 'r' elif base == 'r': return 'y' elif base == 'w': return 'w' elif base == 's': return 's' elif base == 'k': return 'm' elif base == 'm': return 'k' elif base == 'd': return 'h' elif base == 'v': return 'b' elif base == 'h': return 'd' elif base == 'b': return 'v' elif base == 'x': return 'x' elif base == 'n': return 'n' else: return None
[docs]def reverse(seq): """Reverses a sequence. Parameters ---------- seq : str Any sequence. Returns ------- str Reverse sequence. """ return seq[::1]
[docs]def reverse_complement(seq): """Reverse complement a nucleotide sequence. Parameters ---------- seq : str A nucleotide sequence of IUPAC ambuigity codes. Returns ------- str Reverse complement of the nucleotide sequence. """ # rev = seq[::-1] # revcomp = '' # for base in rev: # revcomp += complement(base) # return revcomp complements = [complement(base) for base in seq[::-1]] revcomp = "".join(complements) return revcomp
[docs]def transcribe(dna): """Transcription of DNA sequence to corresponding RNA sequence. Parameters ---------- dna : str DNA nucleotide sequence. Returns ------- str RNA nucleotide sequence. """ return dna.replace('T', 'U').replace('t', 'u')
[docs]def reverse_transcribe(rna): """Reverse transcription of RNA sequence to corresponding cDNA sequence. Parameters ---------- rna : str RNA nucleotide sequence. Returns ------- str DNA nucleotide sequence. """ return rna.replace('U', 'T').replace('u', 't')
[docs]def gc(seq): """Calculate GC content of a nucleotide sequence. Considers all IUPAC codes using 6x counts to deal with 1/2 and 1/3 probabilities from ambiguity codes. Parameters ---------- seq : str A nucleotide sequence. Returns ------- float GC content as fraction of 1. """ if len(seq) == 0: return 0.0 gc_at_values = {'A': (0, 6), 'G': (6, 0), 'C': (6, 0), 'T': (0, 6), 'Y': (3, 3), # pyrimidine (C or T) 'R': (3, 3), # purine (A or G) 'W': (0, 6), # weak (A or T) 'S': (6, 0), # strong (G or C) 'K': (3, 3), # keto (T or G) 'M': (3, 3), # amino (A or C) 'B': (4, 2), # not A 'H': (2, 4), # not G 'D': (2, 4), # not C 'V': (4, 2), # not T 'X': (3, 3), # any 'N': (3, 3), # any '-': (0, 0), # ____________________ 'a': (0, 6), 'g': (6, 0), 'c': (6, 0), 't': (0, 6), 'y': (3, 3), 'r': (3, 3), 'w': (0, 6), 's': (6, 0), 'k': (3, 3), 'm': (3, 3), 'b': (4, 2), 'h': (2, 4), 'd': (2, 4), 'v': (4, 2), 'x': (3, 3), 'n': (3, 3)} gcs = 0 ats = 0 for base in seq: gcs += gc_at_values[base][0] ats += gc_at_values[base][1] return gcs / ((ats + gcs) * 1.0)
[docs]def aln_identity(read, ref, include_indels=False, include_deamination=False, include_unknown=False): """Determines identity of two sequences in an alignment. Calculates modified Hamming distance of an alignment with gaps with optional exclusion of possibly deaminated T>C and A>G as well as indels. Enabling all three optional parameters will calculate values identical to MALT while the default settings will calculate values identical to PMDtools. Parameters ---------- read : str Full read sequence. ref : str Corresponding reference sequence (only the aligned part, but including any gaps, skips and indels). include_indels : bool, optional Consider insertions and deletions, defaults to False. include_deamination : bool, optional Consider any potentially deaminated bases, defaults to False. include_unknown : bool, optional Consider any unknown bases (N), defaults to False. Returns ------- float Identity of read and reference as fraction of 1. str Mismatch string (match = ``|``, mismatch = ``x``, indel = ``-``). Example ------- Showing differences between PMDtools and MALT:: TCCAGCAGGTCGATGACCTTGATGCCGGTCTCGAACATCTTCA ||-|||||||||||||||||||||||||||||||||||||-|x TC-AGCAGGTCGATGACCTTGATGCCGGTCTCGAACATCT-CG ]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]] 43 bases 2 gaps 1 total mismatches 1 G>A mismatches (A in read but G in reference) 0 C>T mismatches (T in read but C in reference) 0 other mismatches 40 matches PMD: 40 / 40 = 100% identity = 1.0 * matches / (matches + other mismatches) = 1.0 * 40 / (40 + 0) = 1.0 MALT: 40 / 43 = 93% identity = 1.0 * matches / (matches + gaps + total mismatches) = 1.0 * 40 / (40 + 2 + 1) = 0.930232 ~ 0.93 Thus it is required to include indels, potentially deaminated bases and unknown bases to produce identity values similar to MALT. """ # this should never happen if reference was correctly reconstructed if len(read) != len(ref): raise ValueError("Undefined for sequences of unequal length") match = 0 mismatch = 0 mismatch_string = '' for a, b in zip(read, ref): thesebases = [a, b] if '-' in thesebases: mismatch_string += '-' # default: exclude indels from mismatches if include_indels: mismatch += 1 continue if a == 'N': if include_unknown: mismatch += 1 else: continue if b == 'N': if include_unknown: mismatch += 1 else: continue if a == b: mismatch_string += '|' match += 1 elif a != b: mismatch_string += 'x' # default: exclude possible deamination from mismatches if not include_deamination: if 'C' == b and 'T' == a: continue if 'G' == b and 'A' == a: continue mismatch += 1 try: perc_identity = 1.0 * match / (match + mismatch) except ZeroDivisionError: perc_identity = 0.0 return (perc_identity, mismatch_string)