#!/usr/bin/env python
#############################################################

import argparse

parser = argparse.ArgumentParser(description="Collate the mapping edit distances of BAM files, for genome-to-genome or transcriptome-to-transcriptome cross-mapping.")
parser.add_argument("-b", "--bam", type=str, nargs="+", help="BAM files to check", required=True)
parser.add_argument("-s", "--source", type=str, help="GTF file containing annotation of source genome", required=True)
parser.add_argument("-a", "--aligned", type=str, help="GTF file containing annotation of aligned genome", required=True)
parser.add_argument("-o", "--out", type=str, help="output file", required=True)
parser.add_argument("-q", "--minqual", type=int, help="minimum MAPQ score to consider an alignment (default: 10)", default=10)

args = parser.parse_args()

#############################################################
# Preparing the GTF objects.

import HTSeq
source_gtf = HTSeq.GFF_Reader(args.source)
source_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)

for feature in source_gtf:
    if feature.type == "exon":
        source_features[feature.iv] += feature.attr["gene_id"]

dest_gtf = HTSeq.GFF_Reader(args.aligned)
dest_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)

for feature in dest_gtf:
    if feature.type == "exon":
        dest_features[feature.iv] += feature.attr["gene_id"]

#############################################################

import os
OHANDLE = open(args.out, mode="w")

import re

for bf in args.bam:
    almnt_file = HTSeq.BAM_Reader(bf)
    cur_chr = re.sub("\\.bam$", "", os.path.basename(bf))
    counts_genome = [0]
    counts_transcriptome = [0]
    counts_transcriptome_edit = [0]
    ref_rlen = -1

    for almnt in almnt_file:
        # Constructing the reference alignment.
        total_rlen = len(almnt.read)
        if ref_rlen < 0:
            ref_rlen = total_rlen
            counts_genome += [0] * total_rlen
            counts_transcriptome += [0] * total_rlen
            counts_transcriptome_edit += [0] * total_rlen
        elif ref_rlen != total_rlen:
            raise RuntimeError, "differing read lengths are not supported"

        ref_pos = int(almnt.read.name)
        s_almnt = HTSeq.Alignment(almnt.read, HTSeq.GenomicInterval(cur_chr, ref_pos, ref_pos+total_rlen, "+"))
        source_ids = set()
        for iv, val in source_features[s_almnt.iv].steps():
            source_ids |= val

        # Checking whether it's actually aligned to the new genome
        if not almnt.aligned or almnt.aQual < args.minqual:
            counts_genome[0] += 1
            if len(source_ids):
                counts_transcriptome[0] += 1
                counts_transcriptome_edit[0] += 1
            continue
        
        # Computing the number of matching bases (edit distance includes indels, so we add them back in)
        edit_distance = almnt.optional_field("NM")
        total_match =- edit_distance
        for cigop in almnt.cigar:
            if cigop.type in "MDI":
                total_match += cigop.size
        counts_genome[total_match] += 1

        # Checking whether they align in both transcriptomes, where we count it as a transcriptomic mismatch
        if len(source_ids): 
            dest_ids = set()
            for iv, val in dest_features[almnt.iv].steps():
                dest_ids |= val

            if len(dest_ids):
                counts_transcriptome[total_match] += 1
            
                # We also use the edit distance - cross-mapping across splice junctions might 
                # have low matches due to soft clipping. Using the total read length to store 
                # the edit distances (i.e., last entry has edit of 0).
                lindex = max(0, total_rlen - edit_distance)
                counts_transcriptome_edit[lindex] += 1
            else:
                counts_transcriptome[0] += 1
                counts_transcriptome_edit[0] += 1

    # Printing diagnostics to file
    print >> OHANDLE, bf + "\tGENOME\tMATCH",
    for y in counts_genome:
        print >> OHANDLE, "\t" + str(y),
    print >> OHANDLE

    print >> OHANDLE, bf + "\tTRANS\tMATCH",
    for y in counts_transcriptome:
        print >> OHANDLE, "\t" + str(y),
    print >> OHANDLE

    print >> OHANDLE, bf + "\tTRANS\tEDIT",
    for y in counts_transcriptome_edit:
        print >> OHANDLE, "\t" + str(y),
    print >> OHANDLE

#############################################################
# Closing handles.

OHANDLE.close()

#############################################################
# End.