Skip to content
Snippets Groups Projects
generate_G2P_out_VCF.py 5.14 KiB
Newer Older
#	given the G2P FAM_ID.report.txt for a trio ran thru G2P
#	given an individual VCF file for a sample from this trio
#	extract a VCF file for the variants in this individual which are in the G2P output
#
#       Author: MH
#       last modified: APR 15, 2019



import sys
import os
import csv
import gzip



VAR_DICT = {}          # key: chr:start:end:ref:alt; value: irrelevant		- for variants found in G2P output for this sample_id
NUM_UNIQ_VARS = 0


def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file):

    # read the G2P output for this sample_id
    read_G2P(in_g2p_file,sample_id)


    # now read the input VCF, check if the variant in the G2P output: if yes - put it in the out file

    in_cntr = 0
    out_cntr = 0

    in_han = gzip.open(in_vcf_file,'r')
    out_han = open(out_vcf_file,'w')
    
    for line in in_han:
        if line.startswith('#'):
            out_han.write(line) 
            continue

        in_cntr += 1

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]


        # did the splitting and normalizing - should noe have multiallelic variants
        if alt.find(',') != -1:
            print "ERROR: found multiallelic variant"
            print line
            raiseSystemExit


        ##############################################################
        # different processing depending on being a SNP, INS, or DEL #
        ##############################################################
         
        if len(ref) == len(alt):			# SNP
            if len(ref) != 1:
                print "ERROR: MNPs are not supported!"
                print line
                raise SystemExit

            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            if key_to_match in VAR_DICT:
                out_cntr += 1
                out_han.write(line)

        elif len(ref) > len(alt):			# DEL
            if len(alt) != 1:
                print "ERROR with a deletion"
                print line
                raise SystemExit 

            key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
            if key_to_match in VAR_DICT:
                out_cntr += 1
                out_han.write(line)

        elif len(ref) < len(alt):                       # INS
            if len(ref) != 1:
                print "ERROR with an insertion"
                print line
                raise SystemExit

            key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
            if key_to_match in VAR_DICT:
                out_cntr += 1
                out_han.write(line)

        else:
            print "Cannot establish the type of this VCF variant"
            print line
            raise SystemExit 



    in_han.close()
    out_han.close()
    print "read %s VCF vars for %s, wrote %s G2P vars in %s" % (in_cntr,sample_id,out_cntr,out_vcf_file)  
    if out_cntr == NUM_UNIQ_VARS:
        print "OK: All G2P vars are recorded in the G2P VCF file"
    else:
        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" 




def read_G2P(in_file,sample_id):
    global NUM_UNIQ_VARS
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]

        # ignore variants in other memebers of the trio 
        sam_id = data[0]
        if sam_id != sample_id:
            continue

        var_list = [y.strip() for y in data[6].split(';')]
         
        for v in var_list:
            v_details = [z.strip() for z in v.split(':')]
            chr = v_details[0]
            start = int(v_details[1])
            end = int(v_details[2])
            ref = v_details[3]
            alt = v_details[4]

            if len(ref) > 1 and len(alt) > 1:																# an INDEL - not normalized
                if len(ref) < len(alt):																	# an INS
                    orig_start = start
                    orig_ref = ref
                    orig_alt = alt 
                    start = orig_start
                    ref = '-'
                    alt = orig_alt[len(orig_ref):]
                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)  
                else:																			# a DEL
                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                    print line
                    raise SystemExit 

            key = '%s:%s:%s:%s' % (chr,start,ref,alt)
            
            if key not in VAR_DICT:
                VAR_DICT[key] = 1
            else:
                pass 

    in_han.close()
    NUM_UNIQ_VARS = len(VAR_DICT)
    print "Found %s unique G2P variants for %s" % (NUM_UNIQ_VARS,sample_id)
    sys.stdout.flush()






# def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file):

if __name__ == '__main__':
    if len(sys.argv) == 5:
        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/generate_G2P_out_VCF.py 2820-gatk-haplotype-annotated.2820_2820.vcf.gz ../output_dd/2820_log_dir/2820.report.txt 2820_2820 2820_2820.G2P.vcf "
        raise SystemExit