# given the G2P FAM_ID.report.txt for a trio ran thru G2P # given an individual VCF file for a sample from this trio # extract a VCF file for the variants in this individual which are in the G2P output # # Author: MH # last modified: APR 15, 2019 import sys import os import csv import gzip VAR_DICT = {} # key: chr:start:end:ref:alt; value: irrelevant - for variants found in G2P output for this sample_id NUM_UNIQ_VARS = 0 def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file): # read the G2P output for this sample_id read_G2P(in_g2p_file,sample_id) # now read the input VCF, check if the variant in the G2P output: if yes - put it in the out file in_cntr = 0 out_cntr = 0 in_han = gzip.open(in_vcf_file,'r') out_han = open(out_vcf_file,'w') for line in in_han: if line.startswith('#'): out_han.write(line) continue in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should noe have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit ############################################################## # different processing depending on being a SNP, INS, or DEL # ############################################################## if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key_to_match in VAR_DICT: out_cntr += 1 out_han.write(line) elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) if key_to_match in VAR_DICT: out_cntr += 1 out_han.write(line) elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) if key_to_match in VAR_DICT: out_cntr += 1 out_han.write(line) else: print "Cannot establish the type of this VCF variant" print line raise SystemExit in_han.close() out_han.close() print "read %s VCF vars for %s, wrote %s G2P vars in %s" % (in_cntr,sample_id,out_cntr,out_vcf_file) if out_cntr == NUM_UNIQ_VARS: print "OK: All G2P vars are recorded in the G2P VCF file" else: print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" def read_G2P(in_file,sample_id): global NUM_UNIQ_VARS in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # ignore variants in other memebers of the trio sam_id = data[0] if sam_id != sample_id: continue var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit key = '%s:%s:%s:%s' % (chr,start,ref,alt) if key not in VAR_DICT: VAR_DICT[key] = 1 else: pass in_han.close() NUM_UNIQ_VARS = len(VAR_DICT) print "Found %s unique G2P variants for %s" % (NUM_UNIQ_VARS,sample_id) sys.stdout.flush() # def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file): if __name__ == '__main__': if len(sys.argv) == 5: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/generate_G2P_out_VCF.py 2820-gatk-haplotype-annotated.2820_2820.vcf.gz ../output_dd/2820_log_dir/2820.report.txt 2820_2820 2820_2820.G2P.vcf " raise SystemExit