NHS_WES_generate_coverage_result_file.py

#	given 
#		<gene_set>.<ID>.sample_interval_summary					- e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
#		<gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt		- e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
#
#	generate a coverage output file
#		all the columns from the coverage file for this regions+ another column with the number of P/LP (ACP) ClinVar variants in this region
#		all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file
#
#       Author: MH
#       last modified: SEPT 26, 2019


import sys
import os
import csv
import gzip


COV_DICT = {}	#	key: 'chr:start-1:end'; value: coverage column (data[8])


def go(in_gatk_file,in_clin_file,out_cov_file):

    # read in the coverage file
    read_coverage(in_gatk_file)

    out_han = open(out_cov_file,'w')
    out_cntr = 0
    out_han.write('chr\tstart\tend\tgene\tnum ClinVar P/LP (ACP)\tpercent bases covered > 20x\n')

    in_han = open(in_clin_file,'r')
    in_cntr = 0

    for line in in_han:
        in_cntr += 1 
        data = [x.strip() for x in line.strip().split('\t')]    
        chr = data[0]
        start = int(data[1])
        end = int(data[2])
        key = '%s:%s:%s' % (chr,start,end)
        if key in COV_DICT:
            cov = COV_DICT[key]
        else:
            print "ERROR: cannot find the coverage for key = %s" % (key)
            raiseSystemExit    
    
        to_write = ''
        for d in data:
            to_write = to_write + '%s\t' % (d)
        to_write = to_write + '%.1f\n' % (cov)   
        out_han.write(to_write)
        out_cntr += 1 

    in_han.close()
    out_han.close()
    print "Read %s intervals from %s" % (in_cntr,in_clin_file)
    print "Recorder %s intervals in output" % (out_cntr)
    print ""
    print "Output coverage file = %s" % (out_cov_file)
    print ""
    sys.stdout.flush()     


def read_coverage(in_file):

    in_han = open(in_file,'r')
    for line in in_han:

        if line.startswith('Target'):
            if line.endswith('%_above_20\n'):
                # header line, the last column is percenatge of bases covered > 20x    
                continue
            else:
                print "ERROR: found header line, but the last column is not bases above 20x"
                print line
                raise SystemExit

        data = [x.strip() for x in line.strip().split('\t')]  
        if len(data) != 9:
            print "ERROR: wrong number of columns in coverage file = %s" % (in_file)
            print line
            raise SystemExit

        locus = data[0]
        if data[8] == 'NaN':
            cov = float(0.0) 
        else:
            cov = float(data[8])

        chr,pos = locus.split(':')
        s,e = pos.split('-')
        start = int(s)-1  
        end = int(e)
        key = '%s:%s:%s' % (chr,start,end)
        if key not in COV_DICT:
            COV_DICT[key] = cov
        else:
            print "ERROR: key from GATK coverage file not unique = %s" % (key)
            raise SystemExit

    in_han.close()
    print "Recorded the coverage for %s unique keys" % (len(COV_DICT))
    sys.stdout.flush()

      
if __name__ == '__main__':
    if len(sys.argv) == 4:
        go(sys.argv[1],sys.argv[2],sys.argv[3])
    else:
        print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/generate_coverage_result_file.py \
                              DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary \
                              /home/u035/u035/shared/resources/G2P/DDG2P.20180830.ClinVar.20190520.plus15bp.txt \
                              DDG2P.s14-NFE-Twist-NA12878.COV.txt"
        raise SystemExit