# given # <gene_set>.<ID>.sample_interval_summary - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt # <gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt # # generate a coverage output file # all the columns from the coverage file for this regions+ another column with the number of P/LP (ACP) ClinVar variants in this region # all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file # # Author: MH # last modified: SEPT 26, 2019 import sys import os import csv import gzip COV_DICT = {} # key: 'chr:start-1:end'; value: coverage column (data[8]) def go(in_gatk_file,in_clin_file,out_cov_file): # read in the coverage file read_coverage(in_gatk_file) out_han = open(out_cov_file,'w') out_cntr = 0 out_han.write('chr\tstart\tend\tgene\tnum ClinVar P/LP (ACP)\tpercent bases covered > 20x\n') in_han = open(in_clin_file,'r') in_cntr = 0 for line in in_han: in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] start = int(data[1]) end = int(data[2]) key = '%s:%s:%s' % (chr,start,end) if key in COV_DICT: cov = COV_DICT[key] else: print "ERROR: cannot find the coverage for key = %s" % (key) raiseSystemExit to_write = '' for d in data: to_write = to_write + '%s\t' % (d) to_write = to_write + '%.1f\n' % (cov) out_han.write(to_write) out_cntr += 1 in_han.close() out_han.close() print "Read %s intervals from %s" % (in_cntr,in_clin_file) print "Recorder %s intervals in output" % (out_cntr) print "" print "Output coverage file = %s" % (out_cov_file) print "" sys.stdout.flush() def read_coverage(in_file): in_han = open(in_file,'r') for line in in_han: if line.startswith('Target'): if line.endswith('%_above_20\n'): # header line, the last column is percenatge of bases covered > 20x continue else: print "ERROR: found header line, but the last column is not bases above 20x" print line raise SystemExit data = [x.strip() for x in line.strip().split('\t')] if len(data) != 9: print "ERROR: wrong number of columns in coverage file = %s" % (in_file) print line raise SystemExit locus = data[0] if data[8] == 'NaN': cov = float(0.0) else: cov = float(data[8]) chr,pos = locus.split(':') s,e = pos.split('-') start = int(s)-1 end = int(e) key = '%s:%s:%s' % (chr,start,end) if key not in COV_DICT: COV_DICT[key] = cov else: print "ERROR: key from GATK coverage file not unique = %s" % (key) raise SystemExit in_han.close() print "Recorded the coverage for %s unique keys" % (len(COV_DICT)) sys.stdout.flush() if __name__ == '__main__': if len(sys.argv) == 4: go(sys.argv[1],sys.argv[2],sys.argv[3]) else: print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/generate_coverage_result_file.py \ DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary \ /home/u035/u035/shared/resources/G2P/DDG2P.20180830.ClinVar.20190520.plus15bp.txt \ DDG2P.s14-NFE-Twist-NA12878.COV.txt" raise SystemExit