Source code for samsifter.util.filters

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Basic filtering operations on files.

.. moduleauthor:: Florian Aldehoff <samsifter@biohazardous.de>
"""
import re
import sys
import fileinput


def pattern_filter(patterns, filehandle, discard=True):
    """Emulates grep-like inverse pattern search.

    Emulates the behaviour of ``grep -v -f PATTERNFILE`` and prints only
    non-matching lines to STDOUT. Inverse operation to print only lines
    matching at least one of the patterns by setting discard to False.

    Every pattern is passed through ``re.escape`` and is therefore matched
    as a literal substring, not as a regular expression. Lines starting with
    ``@`` (presumably SAM header lines) are always printed, regardless of the
    patterns and of ``discard``. Only the line terminator is removed before
    printing, so trailing spaces and tabs of a line are preserved.

    Parameters
    ----------
    patterns : list of str
        List of string patterns to search.
    filehandle : File
        Opened and readable file object.
    discard : bool, optional
        Print only lines matching none of the patterns to STDOUT. Defaults to
        True.

    Returns
    -------
    bool
        True on successful filtering, False on empty pattern list.
    """
    if not patterns:
        return False

    # Build one alternation of all patterns separated by | (OR). Escaping
    # keeps characters such as '|' inside an individual pattern (e.g. in taxa
    # and reference names) from being interpreted as regex syntax. Compile
    # once instead of handing the raw string to re.search for every line.
    matcher = re.compile("|".join(re.escape(pattern) for pattern in patterns))

    for line in filehandle:
        # Strip the line terminator only; a bare rstrip() would also eat
        # trailing whitespace that belongs to the data.
        text = line.rstrip("\r\n")

        if line.startswith('@'):
            print(text, file=sys.stdout)
            continue

        matched = matcher.search(line) is not None
        # discard=True keeps the non-matching lines, discard=False the
        # matching ones.
        keep = not matched if discard else matched
        if keep:
            print(text, file=sys.stdout)

    return True
def line_filter(lines, filehandle, discard=True, offset=0):
    """Filters specific lines from a file.

    Prints only lines not contained in list to STDOUT while ignoring the
    number of lines in the beginning as defined by offset (default is 0 = no
    header). Inverse operation to print only lines contained in list by
    setting ``discard`` to False.

    Only the line terminator is removed before printing, so trailing spaces
    and tabs of a line are preserved. Lines are printed while the file is
    being read, i.e. output may already have been written when the final
    exception for unfiltered lines is raised.

    Parameters
    ----------
    lines : list of int
        List of line numbers to remove from file. Line numbers are considered
        to be 0-based unless an offset is specified. List can be unsorted and
        duplicates will be removed prior to filtering.
    filehandle : File
        Opened and readable file object.
    discard : bool, optional
        Print only lines matching none of the entries to STDOUT. Defaults to
        True.
    offset : int, optional
        Positive offset for line numbers to be used in case the cursor of the
        filehandle has been placed after the start of the line numbering
        ('fast forward'). Useful to skip header sections of a file.

    Returns
    -------
    bool
        True on success, False on empty list.

    Raises
    ------
    Exception
        If file ends before all specified lines are filtered (indicating
        wrong use of the offset parameter).
    IndexError
        If first line to be filtered is within the specified offset.
    ValueError
        If offset is negative.
    """
    if not lines:
        return False

    if offset < 0:
        raise ValueError("negative filtering offset %i not supported" % offset)

    # Deduplicate and sort in descending order so that pop() always yields
    # the smallest line number that is still waiting to be filtered.
    pending = sorted(set(lines), reverse=True)
    next_line = pending.pop()

    # prevent filtering of negative indices (first line to be filtered is
    # within offset)
    if next_line < offset:
        raise IndexError("first filtered line %i < offset %i"
                         % (next_line, offset))

    for num, line in enumerate(filehandle):
        # num counts from the current cursor position, hence the offset
        # correction when comparing against the requested line number.
        listed = next_line is not None and num == next_line - offset
        if listed:
            next_line = pending.pop() if pending else None

        # discard=True keeps the unlisted lines, discard=False the listed
        # ones.
        keep = not listed if discard else listed
        if keep:
            # Strip the line terminator only; a bare rstrip() would also eat
            # trailing whitespace that belongs to the data.
            print(line.rstrip("\r\n"), file=sys.stdout)

    # fail if the file ended while there were still lines left to be filtered
    if next_line is not None:
        raise Exception("%i unfiltered line(s)" % (len(pending) + 1))

    return True
def main():
    """Simple test of pattern_filter and line_filter methods.

    Only ``line_filter`` is exercised at the moment: the lines with the
    hard-coded numbers below are printed to STDOUT and the boolean result of
    the call is printed to STDERR. Input is taken from ``fileinput.input()``,
    i.e. the files named on the command line or STDIN if none are given.
    """
    # To try pattern_filter instead, call e.g.
    #     pattern_filter(["lexicon", "fox", "Rabbit"], handle, False)
    # on a handle of a plain text file.
    lines = [2, 3, 4, 20, 44, 34]

    # The context manager closes the input again, also if line_filter raises
    # because the input is shorter than the highest requested line number.
    with fileinput.input() as handle:
        # Result should include desired line numbers in correct order. If
        # offset is given the appropriate numbers of lines should be skipped.
        print(line_filter(lines, handle, False, offset=0), file=sys.stderr)
# Run the self-test only when the module is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()