Source code for samsifter.util.papertrail
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Annotation of SAM file headers with metadata on applied programs.
.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import sys
import fileinput
[docs]def find_last_in_chain(current_id, pg_entries):
"""Recursively identify the last program entry in the chain.
@PG entries in SAM file headers can be chained by linking each entry to the
previously applied program using the optional PP tag. The actual last entry
of the chain can only be found under the assumption that all programs are
listed within one chain.
Note
----
May lead to wrong assumptions about the actual order of programs applied to
dataset if the chain of PG entries is interrupted by missing PP tags. Use
at own risk!
Parameters
----------
current_id : str
Unique ID of program entry that serves as arbitrary start point for the
recursive search of the chain end.
pg_entries : dict of dicts
Dictionary of @PG record tags (key:value) referenced by their unique
program ID.
Returns
-------
str
Unique ID of the assumed last entry in the program chain (see note
above!)
"""
for entry, tags in pg_entries.items():
try:
if current_id == tags['PP']:
# another entry follows
return find_last_in_chain(entry, pg_entries)
except KeyError:
# entry has no PP tag
continue
# no other entry follows
return current_id
[docs]def create_pg_line(identifier, name=None, command_line=None, previous=None,
description=None, version=None):
"""Create a @PG header for SAM files.
Follows SAMv1 specification, see
https://samtools.github.io/hts-specs/SAMv1.pdf
Parameters
----------
identifier : str
Unique program identifier within the context of the current SAM file
that is identical to the identifier used in the alignment section. May
be modified by later processing steps.
name : str, optional
Program name, defaults to None.
command_line : str, optional
Program command line excluding program name/call, just the optional and
positional arguments. Defaults to None.
previous : str, optional
Program identifier of previous step in program chain.
description : str, optional
Short description of program functionality.
version : str, optional
Program version number/string.
Returns
-------
str
Complete standard-compliant @PG header line.
"""
line = '@PG'
# Program record identifier. Each @PG line must have a unique ID. The value
# of ID is used in the alignment PG tag and PP tags of other @PG lines. PG
# IDs may be modified when merging SAM files in order to handle collisions.
pg_id = 'ID:%s' % identifier
line += '\t' + pg_id
# Program name, optional
if name is not None:
pg_pn = 'PN:%s' % name
line += '\t' + pg_pn
# Command line, optional
if command_line is not None:
pg_cl = 'CL:%s' % command_line
line += '\t' + pg_cl
# Previous @PG-ID. Must match another @PG header’s ID tag. @PG records may
# be chained using PP tag, with the last record in the chain having no PP
# tag. This chain defines the order of programs that have been applied to
# the alignment. PP values may be modified when merging SAM files in order
# to handle collisions of PG IDs. The first PG record in a chain (i.e. the
# one referred to by the PG tag in a SAM record) describes the most recent
# program that operated on the SAM record. The next PG record in the chain
# describes the next most recent program that operated on the SAM record.
# The PG ID on a SAM record is not required to refer to the newest PG
# record in a chain. It may refer to any PG record in a chain, implying
# that the SAM record has been operated on by the program in that PG
# record, and the program(s) referred to via the PP tag.
if previous is not None:
pg_pp = 'PP:%s' % previous
line += '\t' + pg_pp
# Description, optional
if description is not None:
pg_ds = 'DS:%s' % description
line += '\t' + pg_ds
# Program version, optional
if version is not None:
pg_vn = 'VN:%s' % version
line += '\t' + pg_vn
line += '\n'
return line
[docs]def main():
"""Simple test of PG header generation."""
handle = fileinput.input(sys.argv)
header = []
for line in handle:
if line.startswith('@'):
header.append(line)
handle.close()
modified = modify_header(header, name='Test')
for line in modified:
print(line.rstrip(), file=sys.stdout)
if __name__ == "__main__":
main()