#! /usr/bin/python2.2 """ Demo code for range contiguation. In files, ranges are written '-', one per line; in computation, they are handled as 2-tuples like (start, end). Start and end are inclusive. This module defines the relevant functions, and also runs as a standalone script which reads a range file from stdin and writes a contig file to stdout. """ import re import sys import bisect rangeRe = re.compile( r"([0-9]+)-([0-9]+)" ) def read( file ): ranges = [] for line in file: match = rangeRe.match( line ) if (not match): raise ValueError, "bad range line" startStr = match.group( 1 ) endStr = match.group( 2 ) start = int( startStr ) end = int( endStr ) range = (start, end) ranges.append( range ) return ranges def write( ranges, file ): for range in ranges: file.write( str( range[0] ) ) file.write( "-" ) file.write( str( range[1] ) ) file.write( "\n" ) def validate( ranges ): "Validates a sequence of ranges." for range in ranges: if (range[0] < 0): raise ValueError, "range out of range" if (range[0] > range[1]): raise ValueError, "range inside-out" def contiguate( ranges ): "Takes a sequence of possibly overlapping ranges, and returns a list of non-overlapping ranges. Ranges are specified as inclusive (start, end) pairs. The range sequence must be sorted." contigs = [] start = None end = -1 for range in ranges: if (range[0] <= end): if (range[1] > end): end = range[1] else: if (start != None): contigs.append( (start, end) ) start = range[0] end = range[1] contigs.append( (start, end) ) return contigs def total( ranges ): "Calculates the total length of a sequence of ranges." sum = 0 for range in ranges: len = (range[1] - range[0]) + 1 sum = sum + len return sum def find( i, ranges ): "Find the range containing a given index, or None if there isn't one." j = bisect.bisect( ranges, ((i + 1), 0) ) - 1 if (j == -1): return None range = ranges[j] if (i <= range[1]): return range else: return None def main_contiguate(): ranges = read( sys.stdin ) validate( ranges ) ranges.sort() contigs = contiguate( ranges ) write( contigs, sys.stdout ) sys.stderr.write( ("range total: " + str( total( ranges ) ) + "\n") ) sys.stderr.write( ("contig total: " + str( total( contigs ) ) + "\n") )